├── tips
│ ├── foreach
│ │ ├── README.md
│ │ ├── IngestTable.dbc
│ │ ├── ListaTables.dbc
│ │ ├── Ambiente Job Foreach.dbc
│ │ └── Job Foreach - Ingestion Tables.json
│ ├── regex
│ │ ├── README.md
│ │ ├── regexteste.py
│ │ └── regex.py
│ ├── feliznatal
│ │ ├── README.md
│ │ └── feliznatal.sql
│ ├── VNET
│ │ ├── Databricks.jpg
│ │ └── README.md
│ ├── count
│ │ ├── README.md
│ │ ├── 00000000000000000001.json
│ │ ├── Count(_) vs Count(1).sql
│ │ ├── 00000000000000000002.json
│ │ └── 00000000000000000000.json
│ ├── markdown
│ │ ├── README.md
│ │ ├── Magic Commands.sql
│ │ └── OptimizeAndVacuum_Doc.py
│ ├── parallel
│ │ ├── README.md
│ │ └── Paralelismo.py
│ ├── run
│ │ ├── README.md
│ │ ├── notebook2.py
│ │ ├── notebook3.py
│ │ └── notebook1.py
│ ├── widgets
│ │ ├── README.md
│ │ └── Widgets.py
│ ├── dbutils
│ │ ├── README.md
│ │ └── Dbutils-Dataframe.py
│ ├── DatabricksSDKPython
│ │ ├── README.md
│ │ └── Python-SDK.py
│ ├── input_file_name
│ │ ├── README.md
│ │ ├── generate json.py
│ │ └── bronze_demo.py
│ ├── DatabricksServicePrincipal
│ │ ├── README.md
│ │ └── Generate ServicePrincipal Token.py
│ ├── SHOW
│ │ ├── README.md
│ │ └── SHOW COMMANDs.sql
│ ├── logicapp
│ │ ├── README.md
│ │ ├── TableLogicApps.sql
│ │ └── logicapp.json
│ ├── deltaTable
│ │ ├── README.md
│ │ └── Protocols.sql
│ ├── parameters
│ │ ├── README.md
│ │ └── RunDevProd.py
│ ├── sparkconfs
│ │ ├── README.md
│ │ └── Spark Confs.py
│ ├── EXPLODE_STRING
│ │ ├── README.md
│ │ └── Explode usando SQL.py
│ ├── VacuumInventory
│ │ ├── README.md
│ │ └── Vacuum Inventory.py
│ ├── deletionVector
│ │ ├── README.md
│ │ └── DeletionVectors.py
│ ├── parquetvsdelta
│ │ ├── README.md
│ │ └── Delta vs Parquet.sql
│ ├── particionamento
│ │ ├── README.md
│ │ └── Particionar ou Nao_.sql
│ ├── timeTravelVsCDF
│ │ ├── README.md
│ │ └── Time Travel vs Change Data Feed.sql
│ ├── Table lineage
│ │ ├── README.md
│ │ ├── Usabilidade por usuario.sql
│ │ ├── Usabilidade por dia.sql
│ │ └── Usabilidade das tabelas.sql
│ ├── System Tables
│ │ └── ScriptSQL.sql
│ ├── UpgradeMethods
│ │ ├── README.md
│ │ └── UpgradeUC_Examples.sql
│ └── DatabricksAcademy
│   └── Learning.md
├── routines
│ ├── tablesSize&Vacuum
│ │ ├── TablesSize&Vacuum.dbc
│ │ └── README.md
│ └── OptimizeAndVacuum
│   ├── README.md
│   ├── Demo.sql
│   └── OptimizeAndVacuum.py
├── API
│ └── databricks
│   ├── README.md
│   ├── API SQL Statement.py
│   ├── Databrick Jobs List - API.py
│   └── Databricks API - Clusters.py
└── README.md
/tips/foreach/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tips/regex/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tips/feliznatal/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tips/VNET/Databricks.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/reginaldosilva27/Databricks/HEAD/tips/VNET/Databricks.jpg
--------------------------------------------------------------------------------
/tips/foreach/IngestTable.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/reginaldosilva27/Databricks/HEAD/tips/foreach/IngestTable.dbc
--------------------------------------------------------------------------------
/tips/foreach/ListaTables.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/reginaldosilva27/Databricks/HEAD/tips/foreach/ListaTables.dbc
--------------------------------------------------------------------------------
/tips/foreach/Ambiente Job Foreach.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/reginaldosilva27/Databricks/HEAD/tips/foreach/Ambiente Job Foreach.dbc
--------------------------------------------------------------------------------
/routines/tablesSize&Vacuum/TablesSize&Vacuum.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/reginaldosilva27/Databricks/HEAD/routines/tablesSize&Vacuum/TablesSize&Vacuum.dbc
--------------------------------------------------------------------------------
/tips/count/README.md:
--------------------------------------------------------------------------------
1 | ## To use the scripts in this repo, just import them into your folder in Databricks.
2 |
3 | Select the Import option
4 |
5 |
6 | Select the script and import it:
7 |
8 |
--------------------------------------------------------------------------------
/tips/markdown/README.md:
--------------------------------------------------------------------------------
1 | ## To use the scripts in this repo, just import them into your folder in Databricks.
2 |
3 | Select the Import option
4 |
5 |
6 | Select the script and import it:
7 |
8 |
--------------------------------------------------------------------------------
/tips/parallel/README.md:
--------------------------------------------------------------------------------
1 | ## To use the scripts in this repo, just import them into your folder in Databricks.
2 |
3 | Select the Import option
4 |
5 |
6 | Select the script and import it:
7 |
8 |
--------------------------------------------------------------------------------
/tips/run/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## To use the scripts in this repo, just import them into your folder in Databricks.
3 |
4 | Select the Import option
5 |
6 |
7 | Select the script and import it:
8 |
9 |
--------------------------------------------------------------------------------
/tips/widgets/README.md:
--------------------------------------------------------------------------------
1 | ## To use the scripts in this repo, just import them into your folder in Databricks.
2 |
3 | Select the Import option
4 |
5 |
6 | Select the script and import it:
7 |
8 |
--------------------------------------------------------------------------------
/tips/dbutils/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## To use the scripts in this repo, just import them into your folder in Databricks.
3 |
4 | Select the Import option
5 |
6 |
7 | Select the script and import it:
8 |
9 |
--------------------------------------------------------------------------------
/tips/DatabricksSDKPython/README.md:
--------------------------------------------------------------------------------
1 | ## To use the scripts in this repo, just import them into your folder in Databricks.
2 |
3 | Select the Import option
4 |
5 |
6 | Select the script and import it:
7 |
8 |
--------------------------------------------------------------------------------
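The `Python-SDK.py` notebook referenced above is not reproduced in this dump. As a minimal, hedged sketch of what using the Databricks SDK for Python usually looks like (illustrative only, not the content of that script), listing the workspace's clusters with `WorkspaceClient`:

```python
# Requires: pip install databricks-sdk
from databricks.sdk import WorkspaceClient

# Authentication is resolved from the environment (DATABRICKS_HOST / DATABRICKS_TOKEN,
# a .databrickscfg profile, or native auth when running inside a Databricks notebook).
w = WorkspaceClient()

# List all clusters in the workspace and print a few fields
for cluster in w.clusters.list():
    print(cluster.cluster_id, cluster.cluster_name, cluster.state)
```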
/tips/input_file_name/README.md:
--------------------------------------------------------------------------------
1 | ## To use the scripts in this repo, just import them into your folder in Databricks.
2 |
3 | Select the Import option
4 |
5 |
6 | Select the script and import it:
7 |
8 |
--------------------------------------------------------------------------------
/tips/DatabricksServicePrincipal/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## To use the scripts in this repo, just import them into your folder in Databricks.
3 |
4 | Select the Import option
5 |
6 |
7 | Select the script and import it:
8 |
9 |
--------------------------------------------------------------------------------
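The `Generate ServicePrincipal Token.py` script is not reproduced in this dump. As a hedged sketch of the usual approach on Azure (placeholder values below are assumptions, not taken from the script): an Azure AD token for a service principal can be obtained with the client-credentials flow and then sent as a Bearer token to the Databricks REST API.

```python
import requests

# Assumed placeholders - replace with your tenant and service principal values
tenant_id = "<tenant-id>"
client_id = "<application-client-id>"
client_secret = "<client-secret>"

# 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d is the well-known Azure Databricks resource ID
token_url = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"
payload = {
    "grant_type": "client_credentials",
    "client_id": client_id,
    "client_secret": client_secret,
    "scope": "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d/.default",
}

response = requests.post(token_url, data=payload)
response.raise_for_status()
access_token = response.json()["access_token"]

# Use it as: {"Authorization": f"Bearer {access_token}"} against the workspace REST API
print(access_token[:20] + "...")
```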
/API/databricks/README.md:
--------------------------------------------------------------------------------
1 | Scripts for the Databricks API.
2 |
3 | ## To use the scripts in this repo, just import them into your folder in Databricks.
4 |
5 | Select the Import option
6 |
7 |
8 | Select the script and import it:
9 |
10 |
--------------------------------------------------------------------------------
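The folder also contains `Databricks API - Clusters.py`, which is not reproduced in this section. As a minimal, hedged sketch of a call against the Clusters API (workspace URL and token are placeholders, not values from the repo):

```python
import requests

instance = "https://adb-xxxx.azuredatabricks.net"              # placeholder workspace URL
headers = {"Authorization": "Bearer <personal-access-token>"}  # placeholder PAT

# List the clusters in the workspace (Clusters API 2.0)
response = requests.get(f"{instance}/api/2.0/clusters/list", headers=headers)
response.raise_for_status()

for cluster in response.json().get("clusters", []):
    print(cluster["cluster_id"], cluster["cluster_name"], cluster["state"])
```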
/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/run/notebook2.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Spark session
3 | # MAGIC %scala
4 | # MAGIC spark
5 |
6 | # COMMAND ----------
7 |
8 | # DBTITLE 1,Spark context
9 | # MAGIC %scala
10 | # MAGIC spark.sparkContext
11 |
12 | # COMMAND ----------
13 |
14 | print('ola mund0 - notebook2 aqui')
15 |
16 | # COMMAND ----------
17 |
18 | # DBTITLE 1,Showing variables from Notebook1
19 | # Using getArgument
20 | print(getArgument("dataini"))
21 |
22 | # Using widgets
23 | print(dbutils.widgets.get("datafim"))
24 |
--------------------------------------------------------------------------------
/tips/run/notebook3.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | print('ola mund0 - notebook3 aqui')
3 |
4 | # COMMAND ----------
5 |
6 | # DBTITLE 1,Defining variables from the result of a query
7 | dataini = spark.sql("select dataini from tb_parameters").collect()[0][0]
8 | datafim = spark.sql("select datafim from tb_parameters").collect()[0][0]
9 |
10 | # COMMAND ----------
11 |
12 | # DBTITLE 1,Showing variables from Notebook1
13 | # Using getArgument
14 | print(getArgument("dataini"))
15 |
16 | # Using widgets
17 | print(dbutils.widgets.get("datafim"))
18 |
--------------------------------------------------------------------------------
/tips/SHOW/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/logicapp/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/deltaTable/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/parameters/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/sparkconfs/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/EXPLODE_STRING/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/VacuumInventory/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/deletionVector/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/parquetvsdelta/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/particionamento/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
/tips/timeTravelVsCDF/README.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 | Notebooks and tips about Databricks
3 |
4 | ## To use the scripts in this repo, just import them into your folder in Databricks.
5 |
6 | Select the Import option
7 |
8 |
9 | Select the script and import it:
10 |
11 |
--------------------------------------------------------------------------------
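The `Time Travel vs Change Data Feed.sql` notebook is not reproduced in this section. As a hedged PySpark sketch of the two features it compares (the table name is only an example reused from other notebooks in this repo):

```python
# Time travel: query the table as it was at an earlier version (TIMESTAMP AS OF also works)
df_v0 = spark.sql("SELECT * FROM db_demo.PatientInfoDelta VERSION AS OF 0")

# Change Data Feed: read only the row-level changes since a given version
# (requires the table property delta.enableChangeDataFeed = true)
df_changes = (
    spark.read.format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", 1)
    .table("db_demo.PatientInfoDelta")
)

print(df_v0.count())
df_changes.select("_change_type", "_commit_version", "_commit_timestamp").show()
```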
/tips/Table lineage/README.md:
--------------------------------------------------------------------------------
1 | # Monitoring table usage in Unity Catalog
2 |
3 |
4 | ## Monitor the most-used tables in your environment
5 |
6 |
7 | ## Map unused tables, and tables with many writes but few reads
8 |
9 |
--------------------------------------------------------------------------------
/tips/input_file_name/generate json.py:
--------------------------------------------------------------------------------
1 | import json
2 | from faker import Faker
3 | import random
4 |
5 | fake = Faker()
6 |
7 | for i in range(1, 11):
8 | data = [{
9 | 'name': fake.name(),
10 | 'address': fake.address(),
11 | 'email': fake.email(),
12 | 'phone_number': fake.phone_number(),
13 | 'job': fake.job(),
14 | 'age': random.randint(18, 65),
15 | 'company': fake.company(),
16 | 'credit_card_number': fake.credit_card_number(),
17 | 'date_joined': str(fake.date_this_decade())
18 | }]
19 |
20 | with open(f'/Users/reginaldosilva/Documents/Jsons/data{i}.json', 'w') as f:
21 | json.dump(data, f, indent=4)
22 |
--------------------------------------------------------------------------------
/tips/logicapp/TableLogicApps.sql:
--------------------------------------------------------------------------------
1 |
2 | -- Table to log events
3 | drop table tb_OrchestratorEvents;
4 | CREATE TABLE tb_OrchestratorEvents
5 | (
6 | id int IDENTITY PRIMARY KEY,
7 | jobName VARCHAR(200),
8 | jobId VARCHAR(200),
9 | databricksWorkspace VARCHAR(200),
10 | emailList VARCHAR(MAX),
11 | subject VARCHAR(MAX),
12 | customBody VARCHAR(MAX),
13 | dateLog datetime
14 | )
15 |
16 | -- Generate Event
17 | INSERT INTO tb_OrchestratorEvents (jobName, jobId, databricksWorkspace, emailList, subject, customBody, dateLog) VALUES (
18 | 'Job1-Teste',
19 | '981175440018532',
20 | 'https://adb-4013955633331914.14.azuredatabricks.net/api/2.1/jobs/run-now',
21 | 'reginaldo.silva@dataside.com.br',
22 | 'LogicApp - Item criado na tabela tb_OrchestratorEvents',
23 | 'Event Information: Job Run created ',
24 | GETDATE()
25 | )
26 |
27 | --Events
28 | select * from tb_OrchestratorEvents
29 |
--------------------------------------------------------------------------------
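A note on `TableLogicApps.sql` above: the `databricksWorkspace` column already stores the `jobs/run-now` endpoint, so the Logic App that reacts to a new row presumably just POSTs to that URL with the stored `jobId`. A hedged sketch of the equivalent call in Python (the token is a placeholder; the URL and job id are the sample values from the INSERT):

```python
import requests

# Sample values from the INSERT above
run_now_url = "https://adb-4013955633331914.14.azuredatabricks.net/api/2.1/jobs/run-now"
job_id = 981175440018532

headers = {"Authorization": "Bearer <personal-access-token>"}  # placeholder token

# Trigger the job run (Jobs API 2.1); the response contains the run_id
response = requests.post(run_now_url, headers=headers, json={"job_id": job_id})
response.raise_for_status()
print(response.json())
```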
/tips/markdown/Magic Commands.sql:
--------------------------------------------------------------------------------
1 | -- Databricks notebook source
2 | -- DBTITLE 1,SQL - the default language of this notebook
3 | -- MAGIC %sql
4 | -- MAGIC -- No need to specify it, but you can if you want to
5 | -- MAGIC SELECT 'usando linguagem SQL'
6 |
7 | -- COMMAND ----------
8 |
9 | -- DBTITLE 1,Python
10 | -- MAGIC %python
11 | -- MAGIC var = "Opa, agora to no python!"
12 | -- MAGIC print(var)
13 |
14 | -- COMMAND ----------
15 |
16 | -- DBTITLE 1,Shell script
17 | -- MAGIC %sh
18 | -- MAGIC ls -l
19 |
20 | -- COMMAND ----------
21 |
22 | -- DBTITLE 1,Scala
23 | -- MAGIC %scala
24 | -- MAGIC val msg = "Vai de scala?"
25 | -- MAGIC println(msg)
26 |
27 | -- COMMAND ----------
28 |
29 | -- DBTITLE 1,R
30 | -- MAGIC %r
31 | -- MAGIC var <- "R é para os bruxos!"
32 | -- MAGIC print(var)
33 |
34 | -- COMMAND ----------
35 |
36 | -- DBTITLE 1,Markdown
37 | -- MAGIC %md
38 | -- MAGIC ## This is the topic of the post
39 | -- MAGIC Let's talk some more about Markdown
40 |
41 | -- COMMAND ----------
42 |
43 | -- DBTITLE 1,FS
44 | -- MAGIC %fs
45 | -- MAGIC ls /
46 |
47 | -- COMMAND ----------
48 |
49 | -- DBTITLE 1,run - calling other notebooks
50 | -- MAGIC %run /maintenanceDeltalake
51 |
--------------------------------------------------------------------------------
/tips/parquetvsdelta/Delta vs Parquet.sql:
--------------------------------------------------------------------------------
1 | -- Databricks notebook source
2 | -- DBTITLE 1,Creating the DEMO table
3 | -- MAGIC %py
4 | -- MAGIC df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv')
5 | -- MAGIC df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDelta",path='abfss://xxx@xxx.dfs.core.windows.net/bronze/PatientInfoDelta')
6 | -- MAGIC df.display()
7 |
8 | -- COMMAND ----------
9 |
10 | SET spark.databricks.delta.formatCheck.enabled=false
11 |
12 | -- COMMAND ----------
13 |
14 | -- DBTITLE 1,Reading the Parquet files
15 | select * from parquet.`abfss://xxx@xxx.dfs.core.windows.net/bronze/PatientInfoDelta/*.parquet`
16 |
17 | -- COMMAND ----------
18 |
19 | -- DBTITLE 1,Running one UPDATE on the Delta table
20 | -- Updating one record
21 | update db_demo.PatientInfoDelta set age = '33s' where patient_id = '1000000001';
22 | select * from delta.`abfss://xxx@xxx.dfs.core.windows.net/bronze/PatientInfoDelta/`;
23 |
24 | -- COMMAND ----------
25 |
26 | -- DBTITLE 1,Reading the Parquet files of the Delta table
27 | select * from parquet.`abfss://xxx@xxx.dfs.core.windows.net/bronze/PatientInfoDelta/*.parquet`
28 |
29 | -- COMMAND ----------
30 |
31 | -- DBTITLE 1,Reading Parquet files - spark.databricks.delta.formatCheck.enabled
32 | SET spark.databricks.delta.formatCheck.enabled=true;
33 | select * from parquet.`abfss://xxx@xxx.dfs.core.windows.net/bronze/PatientInfoDelta/*.parquet`
34 |
--------------------------------------------------------------------------------
/tips/regex/regexteste.py:
--------------------------------------------------------------------------------
1 | import re
2 | log_string = """
3 | 04:06:58 3 of 68 OK created sql incremental model bronze.vendors [INSERT 0 2 in 6.70s]
4 | """
5 | pattern1 = r"(\d{2}:\d{2}:\d{2})\s" # time
6 | pattern2 = r"(\d+)\s+of\s" # id
7 | pattern3 = r"OK\s+created\s+sql\s+(\w+)\s+model\s" # model type (e.g. incremental)
8 | pattern4 = r"(\d+\.\d+)s\]" # duration in seconds
9 | pattern5 = r"(\d{2}:\d{2}:\d{2})\s+(\d+)\s+of\s+\d+\s+OK\s+created\s+sql\s+(\w+)\s+model\s+([\w\.]+)\s+.*?\[.*?in\s+(\d+\.\d+)s\]" # all columns: time, id, type, table, duration
10 |
11 | print("--------------------------------------")
12 | print(re.search(pattern1, log_string))
13 | print(re.search(pattern1, log_string).group(1))
14 | print("--------------------------------------")
15 | print(re.search(pattern2, log_string))
16 | print(re.search(pattern2, log_string).group(1))
17 | print("--------------------------------------")
18 | print(re.search(pattern3, log_string))
19 | print(re.search(pattern3, log_string).group(1))
20 | print("--------------------------------------")
21 | print(re.search(pattern4, log_string))
22 | print(re.search(pattern4, log_string).group(1))
23 | print("--------------------------------------")
24 | print(re.search(pattern5, log_string))
25 | print(re.search(pattern5, log_string).group(1))
26 | print(re.search(pattern5, log_string).group(2))
27 | print(re.search(pattern5, log_string).group(3))
28 | print(re.search(pattern5, log_string).group(4))
29 | print(re.search(pattern5, log_string).group(5))
--------------------------------------------------------------------------------
/tips/feliznatal/feliznatal.sql:
--------------------------------------------------------------------------------
1 | -- Letter F
2 | SELECT CAST ('POLYGON((1 10, 1 11, 2 11, 2 10.8, 1.25 10.8, 1.25 10.6, 1.75 10.6, 1.75 10.4, 1.25 10.4, 1.25 10, 1 10))' as geometry)
3 | UNION ALL
4 | -- Letter E
5 | SELECT CAST ('POLYGON((2 10, 2 11, 3 11, 3 10.8, 2.25 10.8, 2.25 10.6, 2.75 10.6, 2.75 10.4, 2.25 10.4, 2.25 10.2, 3 10.2, 3 10, 2 10))' as geometry)
6 | UNION ALL
7 | -- Letter L
8 | SELECT CAST ('POLYGON((3.15 11, 3.15 10, 3.85 10, 3.85 10.2, 3.35 10.2, 3.35 11, 3.15 11))' as geometry)
9 | UNION ALL
10 | -- Letter I
11 | SELECT CAST ('POLYGON((4.2 11, 4.8 11, 4.8 10.8, 4.6 10.8, 4.6 10.2, 4.8 10.2, 4.8 10, 4.2 10, 4.2 10.2, 4.4 10.2, 4.4 10.8, 4.2 10.8, 4.2 11))' as geometry)
12 | UNION ALL
13 | -- Letter Z
14 | SELECT CAST ('POLYGON((5 11, 6 11, 5.4 10.2, 6 10.2, 6 10, 5 10, 5.6 10.8, 5 10.8, 5 11))' as geometry)
15 | UNION ALL
16 | -- Letter N
17 | SELECT CAST ('POLYGON((1 10, 1 9, 1.2 9, 1.2 9.8, 1.8 9, 2 9, 2 10, 1.8 10, 1.8 9.3, 1.3 10, 1 10))' as geometry)
18 | UNION ALL
19 | -- Letter A
20 | SELECT CAST ('POLYGON((2 9, 2 10, 3 10, 3 9, 2.75 9, 2.75 9.3, 2.25 9.3, 2.25 9, 2 9),(2.25 9.5, 2.25 9.8, 2.75 9.8, 2.75 9.5, 2.25 9.5))' as geometry)
21 | UNION ALL
22 | -- Letter T
23 | SELECT CAST ('POLYGON((3 9.8, 3 10,4 10, 4 9.8, 3.6 9.8, 3.6 9, 3.4 9, 3.4 9.8, 3 9.8))' as geometry)
24 | UNION ALL
25 | -- Letter A
26 | SELECT CAST ('POLYGON((4 9, 4 10, 5 10, 5 9, 4.75 9, 4.75 9.3, 4.25 9.3, 4.25 9, 4 9),(4.25 9.5, 4.25 9.8, 4.75 9.8, 4.75 9.5, 4.25 9.5))' as geometry)
27 | UNION ALL
28 | -- Letter L
29 | SELECT CAST ('POLYGON((5.15 10, 5.15 9, 5.85 9, 5.85 9.2, 5.35 9.2, 5.35 10, 5.15 10))' as geometry)
--------------------------------------------------------------------------------
/tips/input_file_name/bronze_demo.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Listing files in the landing layer
3 | # MAGIC %fs
4 | # MAGIC ls abfss://datalake@storageunitycatalogdemo.dfs.core.windows.net/landing
5 |
6 | # COMMAND ----------
7 |
8 | # DBTITLE 1,Reading JSON files
9 | # MAGIC %py
10 | # MAGIC df = spark.read.option("multiLine", "True").json('abfss://datalake@storageunitycatalogdemo.dfs.core.windows.net/landing/*.json')
11 | # MAGIC df.display()
12 |
13 | # COMMAND ----------
14 |
15 | # DBTITLE 1,Creating a Delta table
16 | # MAGIC %py
17 | # MAGIC df.write.format('delta') \
18 | # MAGIC .mode('overwrite') \
19 | # MAGIC .saveAsTable("db_demo.person",path='abfss://datalake@storageunitycatalogdemo.dfs.core.windows.net/bronze/person')
20 |
21 | # COMMAND ----------
22 |
23 | # DBTITLE 1,How do we know which file each person came from?
24 | # MAGIC %sql
25 | # MAGIC select name,* from db_demo.person
26 |
27 | # COMMAND ----------
28 |
29 | # DBTITLE 1,Can we use it on the Delta table?
30 | # MAGIC %sql
31 | # MAGIC select input_file_name(),name,* from db_demo.person
32 |
33 | # COMMAND ----------
34 |
35 | # DBTITLE 1,Adding a new nomeArquivo column
36 | # MAGIC %py
37 | # MAGIC from pyspark.sql.functions import input_file_name
38 | # MAGIC df.withColumn("nomeArquivo",input_file_name()) \
39 | # MAGIC .write.format('delta') \
40 | # MAGIC .mode('overwrite') \
41 | # MAGIC .option("overwriteSchema", True) \
42 | # MAGIC .saveAsTable("db_demo.person",path='abfss://datalake@storageunitycatalogdemo.dfs.core.windows.net/bronze/person')
43 |
44 | # COMMAND ----------
45 |
46 | # DBTITLE 1,Now it works!
47 | # MAGIC %sql
48 | # MAGIC select nomeArquivo,name,* from db_demo.person
49 |
--------------------------------------------------------------------------------
/tips/count/00000000000000000001.json:
--------------------------------------------------------------------------------
1 | {"commitInfo":{"timestamp":1683375419086,"userId":"8675301566931963","userName":"reginaldo.silva@dataside.com.br","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"2263512646416784"},"clusterId":"0213-212148-y5jr9wle","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"4064"},"engineInfo":"Databricks-Runtime/12.1.x-scala2.12","txnId":"826be0c3-6b85-4652-85b7-cd6fa83da78f"}}
2 | {"add":{"path":"part-00000-c83f8e97-cbf8-4034-8775-27e5b3f0466c-c000.snappy.parquet","partitionValues":{},"size":4064,"modificationTime":1683375418000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"patient_id\":\"1000003211\",\"sex\":\"male\",\"age\":\"31s\",\"country\":\"Brazil\",\"province\":\"Sao Paulo\",\"city\":\"Boituva\",\"infection_case\":\"Dataholic\",\"contact_number\":\"12\",\"symptom_onset_date\":\"2023-05-06\",\"confirmed_date\":\"2023-05-06\",\"released_date\":\"2023-05-06\",\"state\":\"released\"},\"maxValues\":{\"patient_id\":\"1000003211\",\"sex\":\"male\",\"age\":\"31s\",\"country\":\"Brazil\",\"province\":\"Sao Paulo\",\"city\":\"Boituva\",\"infection_case\":\"Dataholic\",\"contact_number\":\"12\",\"symptom_onset_date\":\"2023-05-06\",\"confirmed_date\":\"2023-05-06\",\"released_date\":\"2023-05-06\",\"state\":\"released\"},\"nullCount\":{\"patient_id\":0,\"sex\":0,\"age\":0,\"country\":0,\"province\":0,\"city\":0,\"infection_case\":0,\"infected_by\":1,\"contact_number\":0,\"symptom_onset_date\":0,\"confirmed_date\":0,\"released_date\":0,\"deceased_date\":1,\"state\":0}}","tags":{"INSERTION_TIME":"1683375418000000","MIN_INSERTION_TIME":"1683375418000000","MAX_INSERTION_TIME":"1683375418000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
3 |
--------------------------------------------------------------------------------
/tips/widgets/Widgets.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Manually defining a text widget
3 | #Define Widgets
4 | dbutils.widgets.text('path', '')
5 | dbutils.widgets.text('dataini', '')
6 | dbutils.widgets.text('datafim', '')
7 | dbutils.widgets.dropdown('debug', 'False', ['True','False'])
8 |
9 | # Define variables
10 | path = dbutils.widgets.get('path')
11 | dataini = dbutils.widgets.get('dataini')
12 | datafim = dbutils.widgets.get('datafim')
13 | debug = dbutils.widgets.get('debug') == 'True' # Returns a boolean
14 |
15 | # In debug mode, only print the variable values; otherwise run a command, in this case dbutils.fs.ls
16 | if(debug):
17 | print('path : ',path)
18 | print('dataini: ',dataini)
19 | print('datafim: ',datafim)
20 | else:
21 | dbutils.fs.ls(path)
22 |
23 | # COMMAND ----------
24 |
25 | # DBTITLE 1,Calling a function using a variable
26 | def getDirContent(ls_path):
27 | path_list = dbutils.fs.ls(ls_path)
28 | for dir_path in dbutils.fs.ls(ls_path):
29 | if dir_path.isDir() and ls_path != dir_path.path and '_delta_log' not in dir_path.path:
30 | path_list += getDirContent(dir_path.path)
31 | return path_list
32 | getDirContent(path)
33 |
34 | # COMMAND ----------
35 |
36 | # DBTITLE 1,Calling a function with a fixed value
37 | def getDirContent(ls_path):
38 | path_list = dbutils.fs.ls(ls_path)
39 | for dir_path in dbutils.fs.ls(ls_path):
40 | if dir_path.isDir() and ls_path != dir_path.path and '_delta_log' not in dir_path.path:
41 | path_list += getDirContent(dir_path.path)
42 | return path_list
43 | getDirContent('/databricks-datasets/COVID/USAFacts/')
44 |
45 | # COMMAND ----------
46 |
47 | dbutils.widgets.removeAll()
48 |
49 | # COMMAND ----------
50 |
51 | # DBTITLE 1,See the available widget options
52 | dbutils.widgets.help()
53 |
--------------------------------------------------------------------------------
/API/databricks/API SQL Statement.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import pandas as pd
4 | from tabulate import tabulate
5 | import matplotlib.pyplot as plt
6 |
7 | url = f"https://adb-xxxx.13.azuredatabricks.net/api/2.0/sql/statements/"
8 |
9 | headers = {
10 | 'Authorization': "Bearer xxxx-3",
11 | "Content-Type": "application/json"
12 | }
13 |
14 | data = {
15 | "warehouse_id": "xxxxxx",
16 | "statement": "select date_format(usage_end_time,'yyyy-MM') as Mes, \
17 | sum(usage_quantity) as DBUs, \
18 | (sum(usage_quantity) * max(c.pricing.default)) as TotalUSD \
19 | from system.billing.usage a \
20 | inner join system.billing.list_prices c on c.sku_name = a.sku_name \
21 | group by all order by 1 desc limit 10",
22 | "wait_timeout": "5s"
23 | }
24 |
25 | response = requests.post(
26 | url = url,
27 | headers=headers,
28 | data=json.dumps(data)
29 | )
30 |
31 | result = json.loads(response.content)
32 |
33 | print("Status Code:", response.status_code)
34 | print(json.dumps(result,indent=4))
35 |
36 | # Extract columns and data
37 | columns = [col["name"] for col in result["manifest"]["schema"]["columns"]]
38 | data = result["result"]["data_array"]
39 |
40 | print(columns)
41 | print(data)
42 |
43 | # Create the DataFrame
44 | df = pd.DataFrame(data, columns=columns)
45 |
46 | # Tabulated print
47 | print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))
48 |
49 | df["TotalUSD"] = pd.to_numeric(df["TotalUSD"])
50 |
51 | # Plot the chart
52 | plt.plot(df['Mes'], df['TotalUSD'],marker='o', linestyle='-', color='b')
53 |
54 | # Add labels and title
55 | plt.xlabel('Mes')
56 | plt.ylabel('TotalUSD')
57 | plt.title('Consumo DBUS')
58 |
59 | # Show the chart
60 | plt.grid(True)
61 | plt.show()
62 |
--------------------------------------------------------------------------------
/tips/Table lineage/Usabilidade por usuario.sql:
--------------------------------------------------------------------------------
1 | -- Most-read tables per user
2 | -- Apply date filters if you find it necessary
3 | -- Customize as needed
4 | select
5 | loginName,
6 | catalogName,
7 | schemaName,
8 | tableName,
9 | sum(READS) as READS,
10 | sum(WRITES) as WRITES
11 | from
12 | (
13 | select
14 | read.created_By as loginName,
15 | t.table_catalog as catalogName,
16 | t.table_schema as schemaName,
17 | t.table_name as tableName,
18 | sum(
19 | case
20 | when read.source_table_name is not null then 1
21 | else 0
22 | end
23 | ) READS,
24 | 0 WRITES
25 | from
26 | system.information_schema.tables t
27 | left join system.access.table_lineage read on t.table_name = read.source_table_name
28 | and t.table_schema = read.source_table_schema
29 | and t.table_catalog = read.source_table_catalog
30 | where
31 | t.table_catalog not in('system')
32 | and t.table_schema not in('information_schema')
33 | and (
34 | read.target_type in('TABLE')
35 | )
36 | group by
37 | all
38 | union all
39 | select
40 | write.created_By as loginName,
41 | t.table_catalog as catalogName,
42 | t.table_schema as schemaName,
43 | t.table_name as tableName,
44 | 0 READS,
45 | sum(
46 | case
47 | when write.target_table_name is not null then 1
48 | else 0
49 | end
50 | ) WRITES
51 | from
52 | system.information_schema.tables t
53 | left join system.access.table_lineage write on t.table_name = write.target_table_name
54 | and t.table_schema = write.target_table_schema
55 | and t.table_catalog = write.target_table_catalog
56 | where
57 | t.table_catalog not in('system')
58 | and t.table_schema not in('information_schema')
59 | and (
60 | write.target_type in('TABLE')
61 | )
62 | group by
63 | all
64 | ) Tabs
65 | group by
66 | all
67 | order by
68 | 1 desc
69 |
--------------------------------------------------------------------------------
/tips/Table lineage/Usabilidade por dia.sql:
--------------------------------------------------------------------------------
1 | -- Most-read tables per day
2 | -- Apply date filters if you find it necessary
3 | -- Customize as needed
4 | select
5 | event_date,
6 | catalogName,
7 | schemaName,
8 | tableName,
9 | sum(READS) as READS,
10 | sum(WRITES) as WRITES
11 | from
12 | (
13 | select
14 | event_date,
15 | t.table_catalog as catalogName,
16 | t.table_schema as schemaName,
17 | t.table_name as tableName,
18 | sum(
19 | case
20 | when read.source_table_name is not null then 1
21 | else 0
22 | end
23 | ) READS,
24 | 0 WRITES
25 | from
26 | system.information_schema.tables t
27 | left join system.access.table_lineage read on t.table_name = read.source_table_name
28 | and t.table_schema = read.source_table_schema
29 | and t.table_catalog = read.source_table_catalog
30 | where
31 | t.table_catalog not in('system')
32 | and t.table_schema not in('information_schema')
33 | and (
34 | read.target_type in('TABLE')
35 | or read.target_type is null
36 | )
37 | group by
38 | all
39 | union all
40 | select
41 | event_date,
42 | t.table_catalog as catalogName,
43 | t.table_schema as schemaName,
44 | t.table_name as tableName,
45 | 0 READS,
46 | sum(
47 | case
48 | when write.target_table_name is not null then 1
49 | else 0
50 | end
51 | ) WRITES
52 | from
53 | system.information_schema.tables t
54 | left join system.access.table_lineage write on t.table_name = write.target_table_name
55 | and t.table_schema = write.target_table_schema
56 | and t.table_catalog = write.target_table_catalog
57 | where
58 | t.table_catalog not in('system')
59 | and t.table_schema not in('information_schema')
60 | and (
61 | write.target_type in('TABLE')
62 | or write.target_type is null
63 | )
64 | group by
65 | all
66 | ) Tabs
67 | group by
68 | all
69 | order by
70 | 1 desc
71 |
--------------------------------------------------------------------------------
/tips/foreach/Job Foreach - Ingestion Tables.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Job Foreach - Ingestion Tables",
3 | "email_notifications": {
4 | "no_alert_for_skipped_runs": false
5 | },
6 | "webhook_notifications": {},
7 | "timeout_seconds": 0,
8 | "max_concurrent_runs": 1,
9 | "tasks": [
10 | {
11 | "task_key": "ListTables",
12 | "run_if": "ALL_SUCCESS",
13 | "notebook_task": {
14 | "notebook_path": "/Workspace/Scripts/ListaTables",
15 | "source": "WORKSPACE"
16 | },
17 | "timeout_seconds": 0,
18 | "email_notifications": {},
19 | "notification_settings": {
20 | "no_alert_for_skipped_runs": false,
21 | "no_alert_for_canceled_runs": false,
22 | "alert_on_last_attempt": false
23 | },
24 | "webhook_notifications": {}
25 | },
26 | {
27 | "task_key": "ForeachTable",
28 | "depends_on": [
29 | {
30 | "task_key": "ListTables"
31 | }
32 | ],
33 | "run_if": "ALL_SUCCESS",
34 | "for_each_task": {
35 | "inputs": "{{tasks.ListTables.values.tableList}}",
36 | "concurrency": 5,
37 | "task": {
38 | "task_key": "IngestTable",
39 | "run_if": "ALL_SUCCESS",
40 | "notebook_task": {
41 | "notebook_path": "/Workspace/Scripts/IngestTable",
42 | "base_parameters": {
43 | "tableConfig": "{{input}}"
44 | },
45 | "source": "WORKSPACE"
46 | },
47 | "timeout_seconds": 0,
48 | "email_notifications": {},
49 | "notification_settings": {
50 | "no_alert_for_skipped_runs": false,
51 | "no_alert_for_canceled_runs": false,
52 | "alert_on_last_attempt": false
53 | },
54 | "webhook_notifications": {}
55 | }
56 | },
57 | "timeout_seconds": 0,
58 | "email_notifications": {},
59 | "notification_settings": {
60 | "no_alert_for_skipped_runs": false,
61 | "no_alert_for_canceled_runs": false,
62 | "alert_on_last_attempt": false
63 | },
64 | "webhook_notifications": {}
65 | }
66 | ],
67 | "queue": {
68 | "enabled": true
69 | },
70 | "run_as": {
71 | "user_name": "reginaldo.silva27@hotmail.com"
72 | }
73 | }
--------------------------------------------------------------------------------
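A note on the `Job Foreach - Ingestion Tables` job above: the `for_each_task` input is `{{tasks.ListTables.values.tableList}}`, i.e. a task value published by the ListTables task, and each iteration passes one element to the `tableConfig` parameter of IngestTable. The `.dbc` notebooks are binary exports, so the following is only a hedged sketch of how the two notebooks could fit that contract (names are assumed from the job definition, not taken from the notebooks):

```python
import json

# --- ListaTables notebook (task "ListTables") ---
# Build the list of tables to ingest; here hard-coded, but it could come from a query.
tables = [
    {"schema": "bronze", "table": "vendors"},
    {"schema": "bronze", "table": "customers"},
]
# Publish it as a task value so the for_each_task can iterate over it.
dbutils.jobs.taskValues.set(key="tableList", value=[json.dumps(t) for t in tables])

# --- IngestTable notebook (task "IngestTable") ---
# Each iteration receives one element of the list through the tableConfig parameter.
table_config = json.loads(dbutils.widgets.get("tableConfig"))
print(f"Ingesting {table_config['schema']}.{table_config['table']}")
```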
/tips/count/Count(_) vs Count(1).sql:
--------------------------------------------------------------------------------
1 | -- Databricks notebook source
2 | -- DBTITLE 1,Create a test table and count rows
3 | -- MAGIC %py
4 | -- MAGIC df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv')
5 | -- MAGIC df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDelta",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta')
6 | -- MAGIC df.count()
7 |
8 | -- COMMAND ----------
9 |
10 | -- DBTITLE 1,View the data
11 | select * from db_demo.PatientInfoDelta
12 |
13 | -- COMMAND ----------
14 |
15 | -- DBTITLE 1,View the execution plan
16 | explain extended select count(*) from db_demo.monitoramento
17 |
18 | -- COMMAND ----------
19 |
20 | -- DBTITLE 1,Insert a new record to generate a new version
21 | insert into db_demo.PatientInfoDelta values('1000003211','male','31s','Brazil','Sao Paulo','Boituva','Dataholic',null,12,current_date(),current_date(),current_date(),null,'released')
22 |
23 | -- COMMAND ----------
24 |
25 | -- DBTITLE 1,Delete 11 records to generate a new version
26 | delete from db_demo.PatientInfoDelta where patient_id between '1000000001' and '1000000011'
27 |
28 | -- COMMAND ----------
29 |
30 | -- DBTITLE 1,View the table size
31 | describe detail db_demo.PatientInfoDelta
32 |
33 | -- COMMAND ----------
34 |
35 | -- DBTITLE 1,View the table versions
36 | describe history db_demo.PatientInfoDelta
37 |
38 | -- COMMAND ----------
39 |
40 | -- DBTITLE 1,Current row count
41 | select count(*) from db_demo.PatientInfoDelta
42 |
43 | -- COMMAND ----------
44 |
45 | -- DBTITLE 1,COUNT using the Delta log
46 | select
47 | sum(from_json(
48 | add.stats,'numRecords DOUBLE'
49 | ).numRecords) as numRecordsAdd
50 | from
51 | json.`abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta/_delta_log/0000000000000000*.json`
52 | where add is not null
53 | and add.path NOT IN (
54 | select remove.path from json.`abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta/_delta_log/0000000000000000*.json`
55 | where remove is not null
56 | )
57 |
58 | -- COMMAND ----------
59 |
60 | -- DBTITLE 1,Viewing the _delta_log metadata
61 | select
62 | from_json(
63 | add.stats,'numRecords DOUBLE'
64 | ).numRecords as numRecordsAdd,
65 | *
66 | from
67 | json.`abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta/_delta_log/0000000000000000*.json`
68 |
--------------------------------------------------------------------------------
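The Delta-log-based count from the last cells above can also be written in PySpark. A minimal sketch, assuming the same `_delta_log` path used in the SQL cells and, like the SQL version, ignoring checkpoint files:

```python
from pyspark.sql import functions as F

# Same _delta_log path used in the SQL cells above
log_path = "abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta/_delta_log/0000000000000000*.json"
log = spark.read.json(log_path)

# Files added to the table, with numRecords parsed out of the JSON stats string
adds = (
    log.where("add is not null")
    .select(
        F.col("add.path").alias("path"),
        F.get_json_object("add.stats", "$.numRecords").cast("long").alias("numRecords"),
    )
)

# Files that were later removed (rewritten by DELETE, OPTIMIZE, etc.)
removes = log.where("remove is not null").select(F.col("remove.path").alias("path"))

# Live files = added files never removed; summing their stats gives the row count
adds.join(removes, on="path", how="left_anti").agg(F.sum("numRecords").alias("rowCount")).show()
```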
/tips/Table lineage/Usabilidade das tabelas.sql:
--------------------------------------------------------------------------------
1 | -- Most-read tables
2 | -- Apply date filters if you find it necessary
3 | -- Customize as needed
4 | select
5 | catalogName,
6 | schemaName,
7 | tableName,
8 | min(first_read) as first_read,
9 | max(last_read) as last_read,
10 | min(first_write) as first_write,
11 | max(last_write) as last_write,
12 | sum(READS) as READS,
13 | sum(WRITES) as WRITES
14 | from
15 | (
16 | select
17 | t.table_catalog as catalogName,
18 | t.table_schema as schemaName,
19 | t.table_name as tableName,
20 | MIN(read.event_date) first_read,
21 | MAX(read.event_date) last_read,
22 | null first_write,
23 | null last_write,
24 | sum(
25 | case
26 | when read.source_table_name is not null then 1
27 | else 0
28 | end
29 | ) READS,
30 | 0 WRITES
31 | from
32 | system.information_schema.tables t
33 | left join system.access.table_lineage read on t.table_name = read.source_table_name
34 | and t.table_schema = read.source_table_schema
35 | and t.table_catalog = read.source_table_catalog
36 | where
37 | t.table_catalog not in('system')
38 | and t.table_schema not in('information_schema')
39 | and (
40 | read.target_type in('TABLE')
41 | or read.target_type is null
42 | )
43 | group by
44 | all
45 | union all
46 | select
47 | t.table_catalog as catalogName,
48 | t.table_schema as schemaName,
49 | t.table_name as tableName,
50 | null first_read,
51 | null last_read,
52 | MIN(write.event_date) first_write,
53 | MAX(write.event_date) last_write,
54 | 0 READS,
55 | sum(
56 | case
57 | when write.target_table_name is not null then 1
58 | else 0
59 | end
60 | ) WRITES
61 | from
62 | system.information_schema.tables t
63 | left join system.access.table_lineage write on t.table_name = write.target_table_name
64 | and t.table_schema = write.target_table_schema
65 | and t.table_catalog = write.target_table_catalog
66 | where
67 | t.table_catalog not in('system')
68 | and t.table_schema not in('information_schema')
69 | and (
70 | write.target_type in('TABLE')
71 | or write.target_type is null
72 | )
73 | group by
74 | all
75 | ) Tabs
76 | group by
77 | all
78 | order by
79 | READS DESC,
80 | WRITES DESC
81 |
--------------------------------------------------------------------------------
/routines/OptimizeAndVacuum/README.md:
--------------------------------------------------------------------------------
1 |
2 | Description of the parameters
3 |
4 | | Parameter | Description | Type |
5 | | ------------- | ------------- | ------------- |
6 | | nomeSchema | Name of the database (schema) where the table is created | string |
7 | | nomeTabela | Name of the table that maintenance will be applied to | string |
8 | | vacuum | True: VACUUM is executed; False: skips VACUUM | bool |
9 | | optimize | True: OPTIMIZE is executed; False: skips OPTIMIZE | bool |
10 | | colunasZorder | If set and optimize is True, applies ZORDER on the comma-separated (,) list of columns | string |
11 | | vacuumRetention | Number of hours of history retained after the VACUUM run | integer |
12 | | debug | Only prints the resulting commands to the screen | bool |
13 |
14 | Examples:
15 |
16 | #### --> First, instantiate the function <--
17 | `` %run /Users/reginaldo.silva@dataside.com.br/OptimizeAndVacuum ``
18 |
19 | #### --> Running VACUUM with 72-hour retention and OPTIMIZE WITHOUT ZORDER <--
20 | ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='funcionario', colunasZorder='none', vacuumRetention=72, vacuum=True, optimize=True, debug=False)``
21 |
22 | #### --> Running VACUUM with the default retention and OPTIMIZE WITH ZORDER <--
23 | ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='patient_id', vacuumRetention=168, vacuum=True, optimize=True, debug=False)``
24 |
25 | #### --> Running only VACUUM <--
26 | ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=False, debug=False)``
27 |
28 | #### --> Running only OPTIMIZE <--
29 | ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=False, optimize=True, debug=False)``
30 |
31 | #### --> Debug mode - print only <--
32 | ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=True, debug=True)``
33 |
34 | ``Created by: Reginaldo Silva``
35 | - [Blog Data In Action](https://datainaction.dev/)
36 | - [Github](https://github.com/reginaldosilva27)
37 |
38 | ``References:``
39 | -
40 | -
41 | -
42 |
--------------------------------------------------------------------------------
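For reference, a minimal sketch of what a `maintenanceDeltalake` function with the signature documented above could look like. This is only an illustration derived from the parameter table; the real implementation lives in `OptimizeAndVacuum.py` and is not reproduced here:

```python
def maintenanceDeltalake(nomeSchema, nomeTabela, colunasZorder='none',
                         vacuumRetention=168, vacuum=True, optimize=True, debug=False):
    """Runs OPTIMIZE (optionally with ZORDER) and/or VACUUM on a Delta table."""
    table = f"{nomeSchema}.{nomeTabela}"
    commands = []

    if optimize:
        if colunasZorder and colunasZorder.lower() != 'none':
            commands.append(f"OPTIMIZE {table} ZORDER BY ({colunasZorder})")
        else:
            commands.append(f"OPTIMIZE {table}")

    if vacuum:
        commands.append(f"VACUUM {table} RETAIN {vacuumRetention} HOURS")

    for command in commands:
        if debug:
            print(f"[DEBUG] {command}")   # debug mode: only print the generated command
        else:
            print(f"Running: {command}")
            spark.sql(command)            # 'spark' is available in a Databricks notebook
```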
/tips/deletionVector/DeletionVectors.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Create a new table
3 | df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv')
4 | df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDelta",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta')
5 | df.count()
6 |
7 | # COMMAND ----------
8 |
9 | # DBTITLE 1,View table details
10 | # MAGIC %sql
11 | # MAGIC describe extended db_demo.PatientInfoDelta
12 |
13 | # COMMAND ----------
14 |
15 | # DBTITLE 1,Delete without Deletion Vectors
16 | # MAGIC %sql
17 | # MAGIC delete from db_demo.PatientInfoDelta where patient_id = 1000000002
18 |
19 | # COMMAND ----------
20 |
21 | # DBTITLE 1,Enable Deletion Vectors - this will upgrade the Delta protocol
22 | # MAGIC %sql
23 | # MAGIC ALTER TABLE db_demo.PatientInfoDelta SET TBLPROPERTIES ('delta.enableDeletionVectors' = true);
24 |
25 | # COMMAND ----------
26 |
27 | # DBTITLE 1,Delete with Deletion Vectors
28 | # MAGIC %sql
29 | # MAGIC delete from db_demo.PatientInfoDelta where patient_id = 1000000001
30 |
31 | # COMMAND ----------
32 |
33 | # DBTITLE 1,COUNT to validate
34 | # MAGIC %sql
35 | # MAGIC select count(*) from db_demo.PatientInfoDelta
36 |
37 | # COMMAND ----------
38 |
39 | # DBTITLE 1,Update with Deletion Vectors? Only with Photon
40 | # MAGIC %sql
41 | # MAGIC update db_demo.PatientInfoDelta set sex = 'male' where patient_id = '1000000033'
42 |
43 | # COMMAND ----------
44 |
45 | # DBTITLE 1,Cleaning up versions and deletion vectors
46 | # MAGIC %sql
47 | # MAGIC set spark.databricks.delta.retentionDurationCheck.enabled = false;
48 | # MAGIC VACUUM db_demo.PatientInfoDelta RETAIN 0 HOURS
49 |
50 | # COMMAND ----------
51 |
52 | # DBTITLE 1,Deletes with Deletion Vectors - performance test
53 | id = 1000000001
54 | while 1 == 1:
55 | spark.sql(f"delete from db_demo.PatientInfoDelta where patient_id = {id}")
56 | print(id)
57 | id=id+1
58 |
59 | # COMMAND ----------
60 |
61 | df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv')
62 | df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDeltaSemDeletion",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDeltaSemDeletion')
63 |
64 | # COMMAND ----------
65 |
66 | # DBTITLE 1,Deletes WITHOUT Deletion Vectors - performance test
67 | id = 1000000001
68 | while 1 == 1:
69 | spark.sql(f"delete from db_demo.PatientInfoDeltaSemDeletion where patient_id = {id}")
70 | print(id)
71 | id=id+1
72 |
--------------------------------------------------------------------------------
/tips/dbutils/Dbutils-Dataframe.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Using dbutils.fs.ls to list a folder
3 | dbutils.fs.ls('/databricks-datasets/COVID/')
4 |
5 | # COMMAND ----------
6 |
7 | # DBTITLE 1,Turning dbutils output into a DataFrame
8 | # MAGIC %py
9 | # MAGIC from pyspark.sql.types import StructType, StructField, IntegerType, StringType
10 | # MAGIC
11 | # MAGIC ddlSchema = StructType([
12 | # MAGIC StructField('path',StringType()),
13 | # MAGIC StructField('name',StringType()),
14 | # MAGIC StructField('size',IntegerType()),
15 | # MAGIC StructField('modificationTime',StringType())
16 | # MAGIC ])
17 | # MAGIC
18 | # MAGIC ls = dbutils.fs.ls('/databricks-datasets/COVID/')
19 | # MAGIC dfPath = spark.createDataFrame(ls,ddlSchema)
20 | # MAGIC dfPath.createOrReplaceTempView('vw_Files')
21 |
22 | # COMMAND ----------
23 |
24 | # DBTITLE 1,Querying with SQL
25 | # MAGIC %sql
26 | # MAGIC -- Note that we only have 2 files; the rest are folders
27 | # MAGIC select count(*) qtdFiles,sum(size) / 1024 / 1024 as size_Mb from vw_Files where size > 0
28 |
29 | # COMMAND ----------
30 |
31 | # DBTITLE 1,Viewing the structure
32 | # MAGIC %sql
33 | # MAGIC -- Folders always show size 0 even when they contain files
34 | # MAGIC select * from vw_Files
35 |
36 | # COMMAND ----------
37 |
38 | # DBTITLE 1,Recursive function to list all folder levels
39 | # Basically, it is a function that calls itself during execution
40 | def get_dir_content(ls_path):
41 | path_list = dbutils.fs.ls(ls_path)
42 | for dir_path in dbutils.fs.ls(ls_path):
43 | if dir_path.isDir() and ls_path != dir_path.path and '_delta_log' not in dir_path.path:
44 | path_list += get_dir_content(dir_path.path)
45 | return path_list
46 |
47 | # COMMAND ----------
48 |
49 | # DBTITLE 1,Now let's use our function to build the DataFrame
50 | # MAGIC %py
51 | # MAGIC from pyspark.sql.types import StructType, StructField, IntegerType, StringType
52 | # MAGIC
53 | # MAGIC ddlSchema = StructType([
54 | # MAGIC StructField('path',StringType()),
55 | # MAGIC StructField('name',StringType()),
56 | # MAGIC StructField('size',IntegerType()),
57 | # MAGIC StructField('modificationTime',StringType())
58 | # MAGIC ])
59 | # MAGIC
60 | # MAGIC dfPath = spark.createDataFrame(get_dir_content('/databricks-datasets/COVID/covid-19-data'),ddlSchema)
61 | # MAGIC dfPath.createOrReplaceTempView('vw_Files')
62 |
63 | # COMMAND ----------
64 |
65 | # DBTITLE 1,Now we have all the files from that folder and its subfolders
66 | # MAGIC %sql
67 | # MAGIC select count(*) qtdFiles,sum(size) / 1024 / 1024 as size_Mb from vw_Files where size > 0
68 |
--------------------------------------------------------------------------------
/tips/count/00000000000000000002.json:
--------------------------------------------------------------------------------
1 | {"commitInfo":{"timestamp":1683375447311,"userId":"8675301566931963","userName":"reginaldo.silva@dataside.com.br","operation":"DELETE","operationParameters":{"predicate":"[\"((spark_catalog.db_demo.PatientInfoDelta.patient_id >= '1000000001') AND (spark_catalog.db_demo.PatientInfoDelta.patient_id <= '1000000011'))\"]"},"notebook":{"notebookId":"2263512646416784"},"clusterId":"0213-212148-y5jr9wle","readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"1","numCopiedRows":"5154","numDeletionVectorsAdded":"0","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"0","executionTimeMs":"1260","numDeletedRows":"11","scanTimeMs":"674","numAddedFiles":"1","rewriteTimeMs":"586"},"engineInfo":"Databricks-Runtime/12.1.x-scala2.12","txnId":"7da2bc04-7b91-4796-805c-aa05261e4c71"}}
2 | {"remove":{"path":"part-00000-dd7b4b44-cbd0-40ac-9549-e9fa424e2888-c000.snappy.parquet","deletionTimestamp":1683375447309,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":53856,"tags":{"INSERTION_TIME":"1683375359000000","MIN_INSERTION_TIME":"1683375359000000","MAX_INSERTION_TIME":"1683375359000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
3 | {"add":{"path":"part-00000-4024a4e5-5f88-4091-980f-2e9c49b1ef79-c000.snappy.parquet","partitionValues":{},"size":53373,"modificationTime":1683375447000,"dataChange":true,"stats":"{\"numRecords\":5154,\"minValues\":{\"patient_id\":\"1000000012\",\"sex\":\"female\",\"age\":\"0s\",\"country\":\"Bangladesh\",\"province\":\"Busan\",\"city\":\"Andong-si\",\"infection_case\":\"Anyang Gunpo Pastors Group\",\"infected_by\":\"1000000003\",\"contact_number\":\"-\",\"symptom_onset_date\":\" \",\"confirmed_date\":\"2020-01-20\",\"released_date\":\"2020-02-06\",\"deceased_date\":\"2020-02-19\",\"state\":\"deceased\"},\"maxValues\":{\"patient_id\":\"7000000019\",\"sex\":\"male\",\"age\":\"90s\",\"country\":\"Vietnam\",\"province\":\"Ulsan\",\"city\":\"sankyeock-dong\",\"infection_case\":\"overseas inflow\",\"infected_by\":\"7000000009\",\"contact_number\":\"95\",\"symptom_onset_date\":\"2020-06-28\",\"confirmed_date\":\"2020-06-30\",\"released_date\":\"2020-06-28\",\"deceased_date\":\"2020-05-25\",\"state\":\"released\"},\"nullCount\":{\"patient_id\":0,\"sex\":1122,\"age\":1380,\"country\":0,\"province\":0,\"city\":94,\"infection_case\":919,\"infected_by\":3813,\"contact_number\":4374,\"symptom_onset_date\":4466,\"confirmed_date\":3,\"released_date\":3578,\"deceased_date\":5088,\"state\":0}}","tags":{"MAX_INSERTION_TIME":"1683375359000000","INSERTION_TIME":"1683375359000000","MIN_INSERTION_TIME":"1683375359000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
4 |
--------------------------------------------------------------------------------
/tips/deltaTable/Protocols.sql:
--------------------------------------------------------------------------------
1 | -- Databricks notebook source
2 | -- DBTITLE 1,Create a basic table with no new features
3 | create table tb_teste (campo1 int);
4 | describe extended tb_teste;
5 |
6 | -- COMMAND ----------
7 |
8 | -- DBTITLE 1,Upgrading to use CDC
9 | -- Upgrades the reader protocol version to 1 and the writer protocol version to 4.
10 | ALTER TABLE tb_teste SET TBLPROPERTIES('delta.minReaderVersion' = '1', 'delta.minWriterVersion' = '4');
11 | describe extended tb_teste;
12 |
13 | -- COMMAND ----------
14 |
15 | -- DBTITLE 1,Creating a table with CDC enabled
16 | create table tb_teste2 (campo1 int) TBLPROPERTIES (delta.enableChangeDataFeed = true);
17 | describe extended tb_teste2;
18 |
19 | -- COMMAND ----------
20 |
21 | -- DBTITLE 1,Creating a table on the latest protocol version to use Deletion Vectors
22 | drop table tb_teste3;
23 | create table tb_teste3 (campo1 int) TBLPROPERTIES('delta.minReaderVersion' = '3', 'delta.minWriterVersion' = '7');
24 | describe extended tb_teste3;
25 |
26 | -- COMMAND ----------
27 |
28 | -- DBTITLE 1,Enabling Deletion Vectors
29 | alter table tb_teste3 SET TBLPROPERTIES ('delta.enableDeletionVectors' = true);
30 | describe extended tb_teste3;
31 |
32 | -- COMMAND ----------
33 |
34 | -- DBTITLE 1,Trying to use a feature not supported by the Databricks Runtime
35 | alter table tb_teste3 SET TBLPROPERTIES ('delta.feature.timestampNtz' = 'supported')
36 |
37 | -- COMMAND ----------
38 |
39 | -- DBTITLE 1,Downgrade
40 | ALTER TABLE tb_teste3 SET TBLPROPERTIES('delta.minReaderVersion' = '1', 'delta.minWriterVersion' = '4')
41 |
42 | -- COMMAND ----------
43 |
44 | create table tb_teste4 (campo1 int) TBLPROPERTIES (delta.enableChangeDataFeed = true);
45 | describe extended tb_teste4;
46 |
47 | -- COMMAND ----------
48 |
49 | -- DBTITLE 1,Trying to read the table with Runtime 11.3
50 | select * from tb_teste3
51 |
52 | -- COMMAND ----------
53 |
54 | -- DBTITLE 1,Enabling timestampNtz
55 | create table tb_teste5 (campo1 int) TBLPROPERTIES (delta.feature.timestampNtz = 'supported');
56 | describe extended tb_teste5;
57 |
58 | -- COMMAND ----------
59 |
60 | create table tb_teste6 (campo1 int) TBLPROPERTIES (delta.feature.enableDeletionVectors = 'supported');
61 | describe extended tb_teste6;
62 |
63 | -- COMMAND ----------
64 |
65 | -- DBTITLE 1,Table Features summary
66 | CREATE TABLE db_demo.teste7 (
67 | patient_id STRING)
68 | USING delta
69 | LOCATION 'abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/teste7'
70 | TBLPROPERTIES (
71 | 'delta.enableDeletionVectors' = 'true',
72 | 'delta.feature.appendOnly' = 'supported',
73 | 'delta.feature.deletionVectors' = 'supported',
74 | 'delta.feature.invariants' = 'supported',
75 | 'delta.minReaderVersion' = '3',
76 | 'delta.minWriterVersion' = '7')
77 |
--------------------------------------------------------------------------------
/tips/run/notebook1.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Define variables in notebook 1
3 | dataini = '2023-01-01'
4 | datafim = '2023-03-31'
5 |
6 | # COMMAND ----------
7 |
8 | # DBTITLE 1,Call notebook 2 with notebook.run()
9 | dbutils.notebook.run('/Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2',
10 | -30,
11 | {"dataini": dataini, "datafim": datafim}
12 | )
13 |
14 | # COMMAND ----------
15 |
16 | # DBTITLE 1,Using %run - passing fixed values works
17 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2 $dataini=2023-01-01 $datafim=2023-03-31
18 |
19 | # COMMAND ----------
20 |
21 | # DBTITLE 1,Passing variables does not work
22 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2 $dataini=dataini $datafim=datafim
23 |
24 | # COMMAND ----------
25 |
26 | # DBTITLE 1,Defining widgets
27 | dbutils.widgets.text("dataini", "2023-01-01")
28 | dbutils.widgets.text("datafim", "2023-03-31")
29 |
30 | # COMMAND ----------
31 |
32 | # DBTITLE 1,Utilizando widgets parece que funciona, né?
33 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2
34 |
35 | # COMMAND ----------
36 |
37 | # DBTITLE 1,Atualizando valores
38 | dbutils.widgets.text("dataini", "2023-02-01")
39 | dbutils.widgets.text("datafim", "2023-02-28")
40 |
41 | # COMMAND ----------
42 |
43 | # DBTITLE 1,Com widgets - Valores não atualizaram
44 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2
45 |
46 | # COMMAND ----------
47 |
48 | # DBTITLE 1,Limpando Widgets
49 | dbutils.widgets.removeAll()
50 |
51 | # COMMAND ----------
52 |
53 | # DBTITLE 1,Criando tabela de parametros
54 | # MAGIC %sql
55 | # MAGIC drop table if exists tb_parameters;
56 | # MAGIC create table if not exists tb_parameters (dataini date, datafim date);
57 | # MAGIC insert into tb_parameters values('2023-01-01','2023-03-31');
58 |
59 | # COMMAND ----------
60 |
61 | # MAGIC %sql
62 | # MAGIC select * from tb_parameters
63 |
64 | # COMMAND ----------
65 |
66 | # DBTITLE 1,Agora sim!
67 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook3
68 |
69 | # COMMAND ----------
70 |
71 | # DBTITLE 1,Sessao do Spark
72 | # MAGIC %scala
73 | # MAGIC spark
74 |
75 | # COMMAND ----------
76 |
77 | # DBTITLE 1,Contexto
78 | # MAGIC %scala
79 | # MAGIC spark.sparkContext
80 |
81 | # COMMAND ----------
82 |
83 | # DBTITLE 1,Testando com notebook.run()
84 | dbutils.notebook.run('/Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2',
85 |                      0,  # 0 = sem timeout
86 | {"dataini": dataini, "datafim": datafim}
87 | )
88 |
89 | # COMMAND ----------
90 |
91 | # DBTITLE 1,Testando sessao do Spark
92 | # MAGIC %run /Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2
93 |
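94 | # COMMAND ----------
95 | 
96 | # DBTITLE 1,Sketch: capturing the return value of notebook.run()
97 | # Illustrative sketch, assuming notebook2 ends with dbutils.notebook.exit(<value>):
98 | # dbutils.notebook.run() returns, as a string, whatever is passed to dbutils.notebook.exit().
99 | result = dbutils.notebook.run(
100 |     '/Users/reginaldo.silva@dataside.com.br/DemoRun/notebook2',
101 |     0,  # 0 = no timeout
102 |     {"dataini": dataini, "datafim": datafim}
103 | )
104 | print(result)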
--------------------------------------------------------------------------------
/API/databricks/Databrick Jobs List - API.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | ##prod
3 | from requests import request
4 | import requests
5 | import json
6 |
7 | instance_id = 'xxxxxx.azuredatabricks.net'
8 |
9 | api_version = '/api/2.1'
10 | api_command = '/jobs/list'
11 | url = f"https://{instance_id}{api_version}{api_command}"
12 | #Adicionar secret
13 | headers = {
14 | 'Authorization': "Bearer xxxxxxx"
15 | }
16 |
17 | has_more = True
18 | count = 0
19 | offset = 0
20 | jsonDataList = []
21 | while has_more:
22 | params = {
23 | 'expand_tasks': 'true',
24 | 'offset': offset
25 | }
26 |
27 | response = requests.get(
28 | url = url,
29 | params = params,
30 | headers= headers
31 | )
32 |
33 | jsonDataList.append(json.dumps(json.loads(response.text), indent = 2))
34 | jsonRDD = sc.parallelize(jsonDataList)
35 | dfProd = spark.read.option('multiline', 'true').option('inferSchema', 'true').json(jsonRDD)
36 | try:
37 | has_more = json.loads(response.text)['has_more']
38 | except:
39 | has_more = False
40 |
41 | count = count + 1
42 | offset = offset + 20
43 | print(count)
44 | print(json.dumps(json.loads(response.text), indent = 2))
45 |
46 | # COMMAND ----------
47 |
48 | from pyspark.sql.functions import *
49 | dfJobsProd = dfProd.select(explode("jobs").alias("jobs")).withColumn("environment", lit("PROD"))
50 | dfJobsProd = dfJobsProd.withColumn('jobname',col('jobs.settings.name').cast('string'))
51 | dfJobsProd.count()
52 |
53 | # COMMAND ----------
54 |
55 | dfJobsProd.select(
56 | dfJobsProd.environment.cast('string').alias("environment"),
57 | dfJobsProd.jobs.job_id.cast('string').alias("job_id"),
58 | dfJobsProd.jobs.creator_user_name.cast('string').alias("creator_user_name"),
59 | dfJobsProd.jobname,
60 | dfJobsProd.jobs.settings.schedule.cast('string').alias("schedule"),
61 | dfJobsProd.jobs.settings.schedule.quartz_cron_expression.cast('string').alias("quartz_cron_expression"),
62 | dfJobsProd.jobs.settings.email_notifications.cast('string').alias("email_notifications"),
63 | dfJobsProd.jobs.settings.timeout_seconds.cast('string').alias("timeout_seconds"),
64 | dfJobsProd.jobs.settings.max_concurrent_runs.cast('string').alias("max_concurrent_runs"),
65 | dfJobsProd.jobs.settings.tasks.cast('string').alias("tasks"),
66 | dfJobsProd.jobs.settings.format.cast('string').alias("format"),
67 | dfJobsProd.jobs.settings.tasks[0].existing_cluster_id.cast('string').alias("existing_cluster_id"),
68 | dfJobsProd.jobs.settings.tasks[1].existing_cluster_id.cast('string').alias("existing_cluster_id2"),
69 | dfJobsProd.jobs.settings.tasks[2].existing_cluster_id.cast('string').alias("existing_cluster_id3"),
70 | to_timestamp(dfJobsProd.jobs.created_time / 1000).alias('created_time')
71 | ).createOrReplaceTempView('vwJobs')
72 |
73 | # COMMAND ----------
74 |
75 | # MAGIC %sql
76 | # MAGIC select * from vwJobs
77 |
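78 | # COMMAND ----------
79 | 
80 | # DBTITLE 1,Sketch: persisting the job inventory as a Delta table
81 | # Illustrative sketch (the target table name is an assumption): keep a dated snapshot
82 | # of the jobs returned by the API so configuration changes can be tracked over time.
83 | from pyspark.sql.functions import current_date
84 | 
85 | (spark.table('vwJobs')
86 |     .withColumn('snapshot_date', current_date())
87 |     .write.format('delta')
88 |     .mode('append')
89 |     .saveAsTable('db_demo.tb_jobs_inventory'))  # hypothetical target table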
--------------------------------------------------------------------------------
/tips/DatabricksSDKPython/Python-SDK.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Instalando SDK
3 | # MAGIC %pip install databricks-sdk --upgrade
4 |
5 | # COMMAND ----------
6 |
7 | # DBTITLE 1,Reiniciando Kernel
8 | dbutils.library.restartPython()
9 |
10 | # COMMAND ----------
11 |
12 | # DBTITLE 1,Listando todos os clusters All Purpose
13 | from databricks.sdk import WorkspaceClient
14 |
15 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx')
16 |
17 | for c in w.clusters.list():
18 | print(c.cluster_name)
19 |
20 | # COMMAND ----------
21 |
22 | # DBTITLE 1,Ligando Clusters
23 | from databricks.sdk import WorkspaceClient
24 |
25 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx')
26 |
27 | for c in w.clusters.list():
28 | try:
29 | print('Ligando Cluster: ', c.cluster_name)
30 | w.clusters.start(cluster_id=c.cluster_id).result()
31 |   except Exception as e:
32 |     print('Cluster já está ligado ou falhou ao iniciar: ', c.cluster_name, '-', e)
33 |
34 | # COMMAND ----------
35 |
36 | # DBTITLE 1,Listando todos os Jobs e quantidade de clusters e tasks
37 | from databricks.sdk import WorkspaceClient
38 |
39 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx')
40 |
41 | job_list = w.jobs.list(expand_tasks=True)
42 | for j in job_list:
43 | #print(j)
44 | print('job_id: ',j.job_id, ' - name:', j.settings.name, ' - job_clusters:', len(j.settings.job_clusters) if j.settings.job_clusters else 'None', ' - tasks:', len(j.settings.tasks), ' - tags:', j.settings.tags)
45 |
46 | # COMMAND ----------
47 |
48 | # DBTITLE 1,Listando todos os Notebooks e subpastas de usuário corrente
49 | from databricks.sdk import WorkspaceClient
50 |
51 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx')
52 |
53 | names = []
54 | for i in w.workspace.list(f'/Users/{w.current_user.me().user_name}', recursive=True):
55 | names.append(i.path)
56 | print(i.path)
57 | assert len(names) > 0
58 |
59 | # COMMAND ----------
60 |
61 | # DBTITLE 1,Listando todos os Notebooks e subpastas do Workspace
62 | from databricks.sdk import WorkspaceClient
63 |
64 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx')
65 |
66 | names = []
67 | for i in w.workspace.list(f'/', recursive=True):
68 | names.append(i.path)
69 | print(i.path)
70 | assert len(names) > 0
71 |
72 | # COMMAND ----------
73 |
74 | # DBTITLE 1,Listando Users do Workspace
75 | from databricks.sdk import WorkspaceClient
76 | from databricks.sdk.service import iam
77 |
78 | w = WorkspaceClient(host='adb-4013955633331914.14.azuredatabricks.net', token='xxxx')
79 |
80 | all_users = w.users.list(attributes="id,userName",
81 | sort_by="userName",
82 | sort_order=iam.ListSortOrder.DESCENDING)
83 |
84 | for u in all_users:
85 | print(u.user_name)
86 |
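87 | # COMMAND ----------
88 | 
89 | # DBTITLE 1,Sketch: authenticating without hardcoding the token
90 | # Illustrative sketch (the secret scope and key names are assumptions): read the PAT
91 | # from a Databricks secret scope instead of pasting it into the notebook.
92 | from databricks.sdk import WorkspaceClient
93 | 
94 | w = WorkspaceClient(
95 |     host='adb-4013955633331914.14.azuredatabricks.net',
96 |     token=dbutils.secrets.get(scope='kv-databricks', key='sdk-token')  # hypothetical scope/key
97 | )
98 | print(w.current_user.me().user_name)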
--------------------------------------------------------------------------------
/tips/parallel/Paralelismo.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | import time
3 | from datetime import datetime
4 | from concurrent.futures import ThreadPoolExecutor
5 |
6 | # COMMAND ----------
7 |
8 | # DBTITLE 1,Serial Way
9 | # Função que recebe um numero e printa ele na tela junto com a data e hora
10 | def printNumber(number):
11 | try:
12 | print(f"{number} - {datetime.today()}")
13 | time.sleep(1)
14 | except:
15 | print(number + ' - ' + str(datetime.today()))
16 |
17 | # Gerando uma lista de numeros e passando cada um para a função
18 | numbers = range(1,11)
19 | [printNumber(i) for i in numbers]
20 |
21 | # COMMAND ----------
22 |
23 | # DBTITLE 1,Parallel Way
24 | # Essa é a mesma função de printar o numero usada no serial
25 | def printNumber(number):
26 | try:
27 | print(f"{number} - {datetime.today()}")
28 | time.sleep(1)
29 | except:
30 | print(number + ' - ' + str(datetime.today()))
31 |
32 | # Criamos uma função que irá receber uma lista de numeros e printar ele de forma paralela
33 | # Note que especificamos tambem a quantidade maxima de paralelismo que pode ser usada
34 | def parallelInt(numbers, numInParallel):
35 | with ThreadPoolExecutor(max_workers=numInParallel) as ec:
36 | return [ec.submit(printNumber, number) for number in numbers]
37 |
38 | # Definindo a lista de numeros e quantidade de threads em paralelo
39 | numbers = range(1,11)
40 | parallelThreads = 10
41 | print(numbers)
42 | result = parallelInt(numbers,parallelThreads)
43 |
44 | # COMMAND ----------
45 |
46 | # MAGIC %sql
47 | # MAGIC describe history db_festivaldemo.PatientInfoDelta
48 |
49 | # COMMAND ----------
50 |
51 | # MAGIC %sql
52 | # MAGIC -- Gerando Delete de exemplo
53 | # MAGIC delete from db_festivaldemo.PatientInfoDelta where patient_id = 1000000002
54 |
55 | # COMMAND ----------
56 |
57 | # Executando um COUNT(*) em cada versão da tabela
58 | listVersions = spark.sql("describe history db_festivaldemo.PatientInfoDelta").collect()
59 | for row in listVersions:
60 | print(f'Version -> {row.version} - Count: {spark.sql(f"select count(*) as qtd from db_festivaldemo.PatientInfoDelta VERSION AS OF {row.version}").collect()[0][0]} - {datetime.today()}')
61 |
62 | # COMMAND ----------
63 |
64 | # Função para executar um count em cada versão da tabela
65 | def getversion(version):
66 | try:
67 | print(f'Version -> {version} - Count: {spark.sql(f"select count(*) as qtd from db_festivaldemo.PatientInfoDelta VERSION AS OF {version}").collect()[0][0]} - {datetime.today()}')
68 |   except Exception as e:
69 |     print(f'{version} - {datetime.today()} - erro: {e}')
70 |
71 | def parallelInt2(versions, numInParallel):
72 |   with ThreadPoolExecutor(max_workers=numInParallel) as ec:
73 |     return [ec.submit(getversion, item.version) for item in versions]
74 |
75 | listVersions = spark.sql("describe history db_festivaldemo.PatientInfoDelta").collect()
76 | parallelThreads = 25
77 | result = parallelInt2(listVersions, parallelThreads)
78 |
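79 | # COMMAND ----------
80 | 
81 | # DBTITLE 1,Sketch: collecting results with as_completed
82 | # Illustrative sketch: instead of discarding the Future objects returned by ec.submit(),
83 | # wait for them and print each result (or surface its exception) as it completes.
84 | from concurrent.futures import as_completed
85 | 
86 | def countVersion(version):
87 |     # Returns (version, row count) so the result can be inspected after the fact
88 |     qtd = spark.sql(f"select count(*) as qtd from db_festivaldemo.PatientInfoDelta VERSION AS OF {version}").collect()[0][0]
89 |     return version, qtd
90 | 
91 | with ThreadPoolExecutor(max_workers=parallelThreads) as ec:
92 |     futures = [ec.submit(countVersion, row.version) for row in listVersions]
93 |     for f in as_completed(futures):
94 |         print(f.result())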
--------------------------------------------------------------------------------
/tips/parameters/RunDevProd.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,1 - Variáveis de ambiente configuradas por Cluster
3 | import os
4 | environment = os.getenv("environment")
5 | database = os.getenv("database")
6 | storageroot = os.getenv("storageroot")
7 |
8 | if environment == 'dev':
9 | print(environment)
10 | print(database)
11 | print(storageroot)
12 |
13 | # Exemplo de utilização (assume um DataFrame df já carregado):
14 | tbName = 'person'
15 | path = f"{storageroot}/{database}/{tbName}"
16 | print(path)
17 | df.write.option("mergeSchema", "true") \
18 |     .mode("append") \
19 |     .format("delta") \
20 |     .saveAsTable(f"{database}.{tbName}", path=path)
21 |
22 | # COMMAND ----------
23 |
24 | # DBTITLE 1,Recuperando Tag Default
25 | spark.conf.get('spark.databricks.clusterUsageTags.clusterId')
26 |
27 | # COMMAND ----------
28 |
29 | # DBTITLE 1,Tag clusterAllTags
30 | spark.conf.get('spark.databricks.clusterUsageTags.clusterAllTags')
31 |
32 | # COMMAND ----------
33 |
34 | # DBTITLE 1,2 - Azure Tags - Automaticamente adicionadas ao cluster
35 | import json
36 | ## Essas tags só podem ser acessadas via clusterAllTags, diferente das Custom e Default
37 | tags = json.loads(spark.conf.get('spark.databricks.clusterUsageTags.clusterAllTags'))
38 | for tag in tags:
39 | if tag["key"] == 'storageroot':
40 | storageroot = tag["value"]
41 | if tag["key"] == 'databricks-environment':
42 | environment = tag["value"]
43 | if tag["key"] == 'department':
44 | department = tag["value"]
45 | if tag["key"] == 'company':
46 | company = tag["value"]
47 |
48 | print(environment)
49 | print(storageroot)
50 | print(department)
51 | print(company)
52 |
53 | # COMMAND ----------
54 |
55 | # DBTITLE 1,3 - Spark Conf - Fixando valor no notebook
56 | workspace = spark.conf.get("spark.databricks.clusterUsageTags.clusterOwnerOrgId")
57 |
58 | if workspace == '5800865833021444': ##dev
59 | instance_id = f'adb-5800865833021444.4.azuredatabricks.net'
60 | storageroot='abfss://lakedev@storageaccountlake.dfs.core.windows.net'
61 | database='db_catalog_dev'
62 | environment='dev'
63 | if workspace == '5800865833021442': ##prod
64 | instance_id = 'adb-5800865833021442.4.azuredatabricks.net'
65 | storageroot='abfss://lakeprod@storageaccountlake.dfs.core.windows.net'
66 | database='db_catalog_prod'
67 | environment='prod'
68 |
69 | print(environment)
70 | print(storageroot)
71 | print(database)
72 | print(instance_id)
73 |
74 | # COMMAND ----------
75 |
76 | # DBTITLE 1,4 - Widgets
77 | # https://www.datainaction.dev/post/databricks-parametrizando-seus-notebooks-like-a-boss-usando-widgets
78 |
79 | # Definindo Widgets manualmente - Não é obrigatório, se você enviar via Job direto funciona
80 | dbutils.widgets.text('environment', '')
81 | dbutils.widgets.text('storageroot', '')
82 | dbutils.widgets.text('database', '')
83 |
84 | # Pegando valor dos Widgets
85 | environment = dbutils.widgets.get('environment')
86 | storageroot = dbutils.widgets.get('storageroot')
87 | database = dbutils.widgets.get('database')
88 |
89 | print(environment)
90 | print(storageroot)
91 | print(database)
92 |
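93 | # COMMAND ----------
94 | 
95 | # DBTITLE 1,Sketch: widget helper with a default value
96 | # Illustrative sketch: read a widget if it already exists, otherwise create it with a
97 | # default, so the same notebook runs both interactively and from a Job without extra setup.
98 | def get_param(name, default):
99 |     try:
100 |         return dbutils.widgets.get(name)
101 |     except Exception:
102 |         dbutils.widgets.text(name, default)
103 |         return default
104 | 
105 | environment = get_param('environment', 'dev')
106 | print(environment)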
--------------------------------------------------------------------------------
/tips/VNET/README.md:
--------------------------------------------------------------------------------
1 | ### VNET (Virtual Network):
2 | A VNET is a private network in Azure that isolates resources and lets them communicate securely. In the Databricks context, VNET Injection deploys the cluster inside the customer's own VNET, giving more control over network traffic and connectivity to external resources.
3 | 
4 | ### Subnets:
5 | Subnets split a VNET into smaller segments. They are used to isolate resources, control traffic, and apply specific security rules. In Databricks, different subnets can be used to isolate communication between clusters and other services.
6 | 
7 | ### Network Security Groups (NSG):
8 | An NSG contains security rules that control inbound and outbound network traffic for the resources in a subnet or network interface. These rules help protect resources inside the VNET by allowing or blocking traffic based on IP addresses, ports, and protocols.
9 | 
10 | ### Private Endpoint:
11 | A Private Endpoint creates a private network interface inside a subnet to connect to Azure services (such as Databricks or Azure Storage) without exposing traffic to the public internet. This improves security by keeping all communication inside the private network.
12 | 
13 | ### Private DNS Servers:
14 | For resources inside a VNET to correctly resolve the domain names associated with Private Endpoints, private DNS servers must be configured. They resolve internal and external IP addresses, ensuring proper communication within the isolated infrastructure.
15 | 
16 | ### Peering:
17 | VNET Peering connects two virtual networks in Azure so they can communicate directly, without gateways or routing over the internet. In a Databricks scenario, peering can connect the VNET where the workspace is injected to other VNETs that host critical resources.
18 | 
19 | ### VPN Gateway:
20 | A VPN gateway provides secure connectivity between an on-premises network and the Azure VNET over an encrypted (IPsec) connection. It lets Azure resources reach the local infrastructure securely and privately, which is useful for hybrid scenarios.
21 | 
22 | ### ExpressRoute:
23 | ExpressRoute is a private connectivity solution that provides dedicated, low-latency connections between the on-premises network and Azure without traversing the public internet. It is typically used for sensitive, high-performance workloads that require higher reliability and security.
24 | 
25 | ### Route Tables:
26 | Route tables control how traffic is routed inside a VNET. They define custom routes, directing traffic to different subnets, gateways, or other destinations. In Databricks, route tables can be used to make sure traffic follows secure and optimized paths.
27 | 
28 | ### NAT Gateway:
29 | A NAT Gateway lets resources in a private subnet reach the internet securely, masking their IP addresses behind a single public egress address. This is useful for controlling outbound traffic and limiting direct exposure of internal resources to the public internet.
30 |
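31 | ### Quick connectivity check (sketch)
32 | A minimal sketch, assuming a VNET-injected workspace and a storage account named `stdts360` (illustrative name): from a notebook, the storage FQDN should resolve to a private IP when the Private Endpoint and Private DNS zone are in place, and TCP 443 should be reachable.
33 | 
34 | ```python
35 | import socket
36 | 
37 | host = "stdts360.dfs.core.windows.net"  # illustrative storage account
38 | print(socket.gethostbyname(host))       # expect a private IP (e.g. 10.x.x.x) with Private Endpoint + Private DNS
39 | socket.create_connection((host, 443), timeout=5).close()  # raises if TCP 443 is not reachable
40 | print("TCP 443 reachable")
41 | ```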
--------------------------------------------------------------------------------
/routines/OptimizeAndVacuum/Demo.sql:
--------------------------------------------------------------------------------
1 | -- Databricks notebook source
2 | -- MAGIC %py
3 | -- MAGIC dbutils.fs.rm('/mnt/raw/database=covid/table=PatientInfoDelta',True)
4 |
5 | -- COMMAND ----------
6 |
7 | -- DBTITLE 1,Preparando o ambiente
8 | -- MAGIC %py
9 | -- MAGIC #Ambiente lendo CSV de exemplos e salvando como tabela DELTA
10 | -- MAGIC df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv')
11 | -- MAGIC #Salve onde quiser, estou usando um Mount para facilitar
12 | -- MAGIC df.write.format('delta').mode('overwrite').saveAsTable("db_festivaldemo.PatientInfoDelta",path='/mnt/raw/database=covid/table=PatientInfoDelta')
13 | -- MAGIC count = 0
14 | -- MAGIC #Alguns Updates para gerar alguns logs e tudo pronto
15 | -- MAGIC while count < 12:
16 | -- MAGIC spark.sql(f"update db_festivaldemo.PatientInfoDelta set age={count} where patient_id = 1000000001")
17 | -- MAGIC print(count)
18 | -- MAGIC count=count+1
19 |
20 | -- COMMAND ----------
21 |
22 | -- DBTITLE 1,Instanciar o notebook utilizando o RUN
23 | -- MAGIC %run /Users/reginaldo.silva@dataside.com.br/OptimizeAndVacuum
24 |
25 | -- COMMAND ----------
26 |
27 | -- DBTITLE 1,Detalhes da tabela
28 | -- Olhe o campo numFiles
29 | describe detail db_festivaldemo.PatientInfoDelta
30 |
31 | -- COMMAND ----------
32 |
33 | -- DBTITLE 1,Quantidade de arquivos no Storage
34 | -- MAGIC %py
35 | -- MAGIC #Note que temos 13 arquivos de dados, contudo no numFiles temos apenas 1, ou seja, esses 12 são histórico e podem ser limpos se não forem usados para Time Travel
36 | -- MAGIC len(dbutils.fs.ls('dbfs:/mnt/raw/database=covid/table=PatientInfoDelta'))
37 |
38 | -- COMMAND ----------
39 |
40 | -- DBTITLE 1,Historico
41 | -- Todas as alterações que fizemos com o Loop
42 | describe history db_festivaldemo.PatientInfoDelta
43 |
44 | -- COMMAND ----------
45 |
46 | -- DBTITLE 1,Usando a função: Chamando com Debug habilitado
47 | -- MAGIC %py
48 | -- MAGIC #Chamando a função instanciada no notebook
49 | -- MAGIC #Usando 2 colunas no ZORDER, apenas para exemplo
50 | -- MAGIC maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='sex,patient_id', vacuumRetention=144, vacuum=True, optimize=True, debug=True)
51 |
52 | -- COMMAND ----------
53 |
54 | -- DBTITLE 1,Usando a função: Executando
55 | -- MAGIC %py
56 | -- MAGIC #Chamando a função instanciada no notebook
57 | -- MAGIC #Usando 0 horas apenas para exemplo, o recomendado é 7 dias
58 | -- MAGIC maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='sex,patient_id', vacuumRetention=0, vacuum=True, optimize=True, debug=False)
59 |
60 | -- COMMAND ----------
61 |
62 | -- DBTITLE 1,Rodando sem HIVE
63 | -- MAGIC %py
64 | -- MAGIC #Executando passando o caminho direto no Lake: defina o schema como delta e coloque o caminho entre crases (`caminho`)
65 | -- MAGIC maintenanceDeltalake(nomeSchema='delta', nomeTabela='`/mnt/raw/database=covid/table=PatientInfoDelta`', colunasZorder='sex,patient_id', vacuumRetention=144, vacuum=True, optimize=True, debug=False)
66 |
67 | -- COMMAND ----------
68 |
69 | -- MAGIC %py
70 | -- MAGIC #recontagem
71 | -- MAGIC len(dbutils.fs.ls('dbfs:/mnt/raw/database=covid/table=PatientInfoDelta'))
72 |
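73 | -- COMMAND ----------
74 | 
75 | -- DBTITLE 1,Sketch: checking the result of OPTIMIZE and VACUUM
76 | -- Illustrative check: numFiles in DESCRIBE DETAIL should drop after OPTIMIZE,
77 | -- and the OPTIMIZE/VACUUM operations now appear in DESCRIBE HISTORY (see the earlier cell)
78 | describe detail db_festivaldemo.PatientInfoDelta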
--------------------------------------------------------------------------------
/tips/regex/regex.py:
--------------------------------------------------------------------------------
1 | import re
2 | import csv
3 | from tabulate import tabulate
4 |
5 | log_string = """
6 | 04:06:51 1 of 25 START sql incremental model bronze.users [RUN]
7 | 04:06:51 2 of 25 START sql incremental model bronze.prices [RUN]
8 | 04:06:51 3 of 25 START sql incremental model bronze.vendors [RUN]
9 | 04:06:51 4 of 25 START sql table model bronze.customers [RUN]
10 | 04:06:58 3 of 25 OK created sql incremental model bronze.vendors [INSERT 0 2 in 6.70s]
11 | 04:06:58 5 of 25 START sql incremental model bronze.orders [RUN]
12 | 04:06:58 4 of 25 OK created sql table model bronze.customers [SELECT in 6.94s]
13 | 04:06:58 6 of 25 START sql incremental model bronze.teste [RUN]
14 | 04:07:00 2 of 25 OK created sql incremental model bronze.prices [INSERT 0 133 in 8.31s]
15 | 04:07:00 7 of 25 START sql table model bronze.email .............. [RUN]
16 | 04:07:06 1 of 25 OK created sql incremental model bronze.users [INSERT 0 178089 in 14.30s]
17 | 04:07:06 8 of 25 START sql view model bronze.sales [RUN]
18 | 04:07:10 5 of 25 OK created sql incremental model bronze.orders [INSERT 0 5 in 1200.90s]
19 | 04:07:10 9 of 25 START sql view model bronze.people [RUN]
20 | 04:07:13 8 of 25 OK created sql view model bronze.sales [CREATE VIEW in 74.74s]
21 | 04:07:13 10 of 25 START sql view model bronze.transfers ... [RUN]
22 | 04:07:18 9 of 25 OK created sql view model bronze.people [CREATE VIEW in 8.04s]
23 | 04:07:18 11 of 25 START sql view model bronze.employees [RUN]
24 | 04:07:21 10 of 25 OK created sql view model bronze.transfers [CREATE VIEW in 700.72s]
25 | 04:07:21 12 of 25 START sql incremental model bronze.undefined .. [RUN]
26 | 04:07:23 11 of 25 OK created sql view model bronze.employees [CREATE VIEW in 80.90s]
27 | """
28 |
29 | # Criando lista, quebrando por quebra de linha
30 | logs = log_string.split("\n")
31 |
32 | # Filtrando apenas eventos de finalização
33 | logs = list(filter(lambda x: "OK created" in x, logs))
34 |
35 | # Regra Regex para extrair informações necessárias
36 | pattern = r"(\d{2}:\d{2}:\d{2})\s+(\d+)\s+of\s+\d+\s+OK\s+created\s+sql\s+(\w+)\s+model\s+([\w\.]+)\s+.*?\[.*?in\s+(\d+\.\d+)s\]"
37 |
38 | # Criando um loop para processar cada log
39 | log_data = []
40 | for log in logs:
41 | match = re.search(pattern, log)
42 | if match:
43 | start_time = match.group(1)
44 | task_number = int(match.group(2))
45 | model_type = match.group(3)
46 | model_name = match.group(4)
47 | duration_seconds = float(match.group(5))
48 | duration_minutes = round(duration_seconds / 60,2)
49 |
50 | # Adicionando os dados à lista
51 | log_data.append([start_time, task_number, model_type, model_name, duration_seconds, duration_minutes])
52 | else:
53 | print("Log não corresponde ao padrão esperado.")
54 |
55 | # Ordenando pelo mais demorado
56 | log_data.sort(key=lambda x: x[4], reverse=True)
57 |
58 | # Printando na tela em formato tabular
59 | print(tabulate(log_data, headers=["start", "task", "type", "model", "duration_sec","duration_min"]))
60 |
61 | # Gerando CSV
62 | csv_file = "/Users/reginaldosilva/Downloads/log_data.csv"
63 | with open(csv_file, mode='w', newline='') as file:
64 | writer = csv.writer(file)
65 | writer.writerow(["start", "task", "type", "model", "duration_sec","duration_min"])
66 | writer.writerows(log_data)
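66 | 
67 | # Optional variant (illustrative sketch): the same pattern written with named groups,
68 | # which makes the extraction above easier to read; the captured values are identical.
69 | named_pattern = (r"(?P<start>\d{2}:\d{2}:\d{2})\s+(?P<task>\d+)\s+of\s+\d+\s+OK\s+created\s+sql\s+"
70 |                  r"(?P<type>\w+)\s+model\s+(?P<model>[\w\.]+)\s+.*?\[.*?in\s+(?P<secs>\d+\.\d+)s\]")
71 | first_ok = re.search(named_pattern, logs[0])
72 | if first_ok:
73 |     print(first_ok.group("model"), first_ok.group("secs"))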
--------------------------------------------------------------------------------
/tips/count/00000000000000000000.json:
--------------------------------------------------------------------------------
1 | {"commitInfo":{"timestamp":1683375359766,"userId":"8675301566931963","userName":"reginaldo.silva@dataside.com.br","operation":"CREATE OR REPLACE TABLE AS SELECT","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"notebook":{"notebookId":"2263512646416784"},"clusterId":"0213-212148-y5jr9wle","isolationLevel":"WriteSerializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"5165","numOutputBytes":"53856"},"engineInfo":"Databricks-Runtime/12.1.x-scala2.12","txnId":"705577b8-7af9-4bff-ba50-745fb31a9e10"}}
2 | {"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
3 | {"metaData":{"id":"8806a8ae-e6cb-41dc-8f63-b137f76f944e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"patient_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sex\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"age\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"country\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"province\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"city\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"infection_case\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"infected_by\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"contact_number\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"symptom_onset_date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"confirmed_date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"released_date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"deceased_date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"state\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1683375358932}}
4 | {"add":{"path":"part-00000-dd7b4b44-cbd0-40ac-9549-e9fa424e2888-c000.snappy.parquet","partitionValues":{},"size":53856,"modificationTime":1683375359000,"dataChange":true,"stats":"{\"numRecords\":5165,\"minValues\":{\"patient_id\":\"1000000001\",\"sex\":\"female\",\"age\":\"0s\",\"country\":\"Bangladesh\",\"province\":\"Busan\",\"city\":\"Andong-si\",\"infection_case\":\"Anyang Gunpo Pastors Group\",\"infected_by\":\"1000000002\",\"contact_number\":\"-\",\"symptom_onset_date\":\" \",\"confirmed_date\":\"2020-01-20\",\"released_date\":\"2020-02-05\",\"deceased_date\":\"2020-02-19\",\"state\":\"deceased\"},\"maxValues\":{\"patient_id\":\"7000000019\",\"sex\":\"male\",\"age\":\"90s\",\"country\":\"Vietnam\",\"province\":\"Ulsan\",\"city\":\"sankyeock-dong\",\"infection_case\":\"overseas inflow\",\"infected_by\":\"7000000009\",\"contact_number\":\"95\",\"symptom_onset_date\":\"2020-06-28\",\"confirmed_date\":\"2020-06-30\",\"released_date\":\"2020-06-28\",\"deceased_date\":\"2020-05-25\",\"state\":\"released\"},\"nullCount\":{\"patient_id\":0,\"sex\":1122,\"age\":1380,\"country\":0,\"province\":0,\"city\":94,\"infection_case\":919,\"infected_by\":3819,\"contact_number\":4374,\"symptom_onset_date\":4475,\"confirmed_date\":3,\"released_date\":3578,\"deceased_date\":5099,\"state\":0}}","tags":{"INSERTION_TIME":"1683375359000000","MIN_INSERTION_TIME":"1683375359000000","MAX_INSERTION_TIME":"1683375359000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
5 |
--------------------------------------------------------------------------------
/tips/particionamento/Particionar ou Nao_.sql:
--------------------------------------------------------------------------------
1 | -- Databricks notebook source
2 | -- DBTITLE 1,Criando ambiente
3 | -- MAGIC %py
4 | -- MAGIC df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv')
5 | -- MAGIC df.count()
6 |
7 | -- COMMAND ----------
8 |
9 | -- DBTITLE 1,Exemplo dos dados
10 | -- MAGIC %py
11 | -- MAGIC df.display()
12 |
13 | -- COMMAND ----------
14 |
15 | -- DBTITLE 1,Gravando tabela particionada por pais
16 | -- MAGIC %py
17 | -- MAGIC df.write.format('parquet').mode('overwrite').partitionBy('Country').saveAsTable("db_demo.PatientInfoParquet_Country",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/table=PatientInfoParquet_Country')
18 |
19 | -- COMMAND ----------
20 |
21 | -- DBTITLE 1,Gravando tabela sem particionamento
22 | -- MAGIC %py
23 | -- MAGIC df.write.format('parquet').mode('overwrite').saveAsTable("db_demo.PatientInfoParquet_SemParticao",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/table=PatientInfoParquet_SemParticao')
24 |
25 | -- COMMAND ----------
26 |
27 | -- DBTITLE 1,Leitura usando particionamento
28 | select * from db_demo.PatientInfoParquet_Country where country = 'Canada'
29 |
30 | -- COMMAND ----------
31 |
32 | -- DBTITLE 1,Leitura sem particionamento
33 | select * from db_demo.PatientInfoParquet_SemParticao where country = 'Canada'
34 |
35 | -- COMMAND ----------
36 |
37 | -- MAGIC %py
38 | -- MAGIC df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDeltaSemParticao",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/table=PatientInfoDeltaSemParticao')
39 |
40 | -- COMMAND ----------
41 |
42 | -- MAGIC %py
43 | -- MAGIC df.write.format('delta').mode('overwrite').partitionBy('Country').saveAsTable("db_demo.PatientInfoDeltaCountry",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/table=PatientInfoDeltaCountry')
44 |
45 | -- COMMAND ----------
46 |
47 | select * from db_demo.PatientInfoDeltaSemParticao where country = 'Canada'
48 |
49 | -- COMMAND ----------
50 |
51 | select * from db_demo.PatientInfoDeltaCountry where country = 'Canada'
52 |
53 | -- COMMAND ----------
54 |
55 | OPTIMIZE db_demo.PatientInfoDeltaSemParticao ZORDER BY (country)
56 |
57 | -- COMMAND ----------
58 |
59 | -- DBTITLE 1,Exemplo de particionamento por várias colunas
60 | -- MAGIC %py
61 | -- MAGIC df.write.format("delta").mode("overwrite").partitionBy(
62 | -- MAGIC "country", "province", "city", "sex"
63 | -- MAGIC ).saveAsTable(
64 | -- MAGIC "db_demo.PatientInfoDeltaParticionada",
65 | -- MAGIC path="abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/table=PatientInfoDeltaParticionada",
66 | -- MAGIC )
67 |
68 | -- COMMAND ----------
69 |
70 | select * from parquet.`abfss://reginaldo@stdts360.dfs.core.windows.net/part-00000-acd72083-0f7c-4f3e-85d2-07fc39aa714c.c000.snappy.parquet`
71 |
72 | -- COMMAND ----------
73 |
74 | select * from (
75 | select from_json(add.stats,'numRecords bigint').numRecords as numRecords,
76 | from_json(add.stats,'minValues struct<tickets_id bigint>').minValues.tickets_id as minValues,
77 | from_json(add.stats,'maxValues struct<tickets_id bigint>').maxValues.tickets_id as maxValues,
78 | add.path
79 | from json.`abfss://xxxx@xxxx.dfs.core.windows.net/xxxx/logs2/_delta_log/00000000000000000002.json`
80 | where add is not null
81 | ) tab where 22334863 between minValues and maxValues
82 | order by maxValues,minValues desc
83 |
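84 | -- COMMAND ----------
85 | 
86 | -- DBTITLE 1,Sketch: comparing file counts between the two strategies
87 | -- Illustrative check: over-partitioning shows up as a much larger numFiles for roughly the same sizeInBytes
88 | describe detail db_demo.PatientInfoDeltaCountry
89 | 
90 | -- COMMAND ----------
91 | 
92 | describe detail db_demo.PatientInfoDeltaSemParticao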
--------------------------------------------------------------------------------
/tips/System Tables/ScriptSQL.sql:
--------------------------------------------------------------------------------
1 | -- Todas as tabelas do seu ambiente
2 | select * from system.information_schema.tables where table_owner <> 'System user';
3 |
4 | -- Todas as colunas de cada tabela
5 | select c.table_name,array_join(collect_set(column_name), ',') as columns from system.information_schema.columns c
6 | inner join system.information_schema.tables t on c.table_name = t.table_name and c.table_catalog = t.table_catalog
7 | where t.table_owner <> 'System user'
8 | group by all;
9 |
10 | -- Quantidade de tabelas por schema e catalog
11 | select table_catalog,table_schema,count(*) as qtdTables
12 | from system.information_schema.tables where table_owner <> 'System user'
13 | group by all;
14 |
15 | -- Auditoria do seu ambiente
16 | select * from system.access.audit order by event_time desc;
17 |
18 | -- Ultimo acesso nas suas tabelas
19 | select LastAccess.event_time,LastAccess.entity_type,LastAccess.created_by,* from system.information_schema.tables a
20 | LEFT JOIN
21 | LATERAL (select max(b.event_time) as event_time, LAST(b.entity_type) as entity_type, LAST(b.created_by) as created_by
22 | from system.access.table_lineage b where b.target_table_name = a.table_name) as LastAccess
23 | where a.table_owner <> 'System user';
24 |
25 | -- Quem acessou sua tabela e quando?
26 | select * from system.access.table_lineage where target_table_name = 'tbordersliquid'
27 | order by event_time desc;
28 |
29 | -- Todos os clusters do ambiente
30 | select cluster_source,count(*) as qtd from system.compute.clusters
31 | group by all;
32 |
33 | -- Clusters All Purpose
34 | select * from system.compute.clusters where cluster_source = 'UI';
35 |
36 | -- Job Clusters mais custosos
37 | SELECT usage_metadata.job_id as `Job ID`, sum(usage_quantity) as `DBUs`
38 | FROM system.billing.usage
39 | WHERE usage_metadata.job_id IS NOT NULL
40 | GROUP BY `Job ID`
41 | ORDER BY `DBUs` DESC;
42 |
43 | -- Cluster mais custoso
44 | select b.cluster_name, sum(usage_quantity) as `DBUs Consumed` from system.billing.usage a
45 | inner join system.compute.clusters b on a.usage_metadata.cluster_id = b.cluster_id
46 | where usage_metadata.cluster_id is not null
47 | group by all
48 | order by 2 desc;
49 |
50 | -- DBUs consumidos por dia (todos os clusters)
51 | select usage_date as `Date`, sum(usage_quantity) as `DBUs Consumed` from system.billing.usage a
52 | inner join system.compute.clusters b on a.usage_metadata.cluster_id = b.cluster_id
53 | where usage_metadata.cluster_id is not null
54 | group by all
55 | order by 1 desc;
56 |
57 |
58 | -- Cluster mais custoso em USD
59 | select b.cluster_name, sum(usage_quantity) as `DBUs Consumed`, (sum(usage_quantity) * max(c.pricing.default)) as TotalUSD
60 | from system.billing.usage a
61 | inner join system.compute.clusters b on a.usage_metadata.cluster_id = b.cluster_id
62 | inner join system.billing.list_prices c on c.sku_name = a.sku_name
63 | where usage_metadata.cluster_id is not null
64 | and usage_start_time between '2023-11-01' and '2023-11-30'
65 | group by all
66 | order by 3 desc;
67 |
68 |
69 | -- total em USD por mês
70 | select month(usage_end_time) as mes,sum(usage_quantity) as `DBUs Consumed`, (sum(usage_quantity) * max(c.pricing.default)) as TotalUSD
71 | from system.billing.usage a
72 | inner join system.billing.list_prices c on c.sku_name = a.sku_name
73 | group by all
74 | order by 1 desc;
75 |
76 | -- Execuções do PREDICTIVE OPTIMIZATION
77 | select * from system.storage.predictive_optimization_operations_history;
78 |
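79 | -- Sketch: DBUs and estimated USD per day over the last 30 days (estimate only; price changes over time are not handled)
80 | select usage_date, sum(usage_quantity) as `DBUs`, sum(usage_quantity * c.pricing.default) as `Estimated USD`
81 | from system.billing.usage a
82 | inner join system.billing.list_prices c on c.sku_name = a.sku_name
83 | where usage_date >= date_sub(current_date(), 30)
84 | group by all
85 | order by usage_date desc;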
--------------------------------------------------------------------------------
/tips/VacuumInventory/Vacuum Inventory.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Variables
3 | from datetime import datetime, timedelta
4 | ## Script for Azure Databricks
5 | ## For AWS and GCP you need customize some code blocks
6 | ## If you are not using Unity Catalog, use as catalog name: hive_metastore
7 | ## Author: Reginaldo Silva
8 |
9 | ##########################
10 | ## Set Variables
11 | ##########################
12 | storageName = 'sahierarchicaldatalake'
13 | dataBucket = 'datalake'
14 | inventoryBucket = 'inventory'
15 |
16 | inventoryCatalogName = 'dev'
17 | inventoryDabaseName = 'datainaction'
18 | inventoryTableName = 'vacuumInventory'
19 | ##########################
20 |
21 | current_day = datetime.now().strftime('%d')
22 | current_month = datetime.now().strftime('%m')
23 | current_year = datetime.now().year
24 | dataStoragePath = f'abfss://{dataBucket}@{storageName}.dfs.core.windows.net/'
25 |
26 | try:
27 | dbutils.fs.ls(f'abfss://{inventoryBucket}@{storageName}.dfs.core.windows.net/{current_year}/{current_month}/{current_day}/')
28 | except:
29 | print('No files found using current day, trying D-1...')
30 | try:
31 | current_day = (datetime.today() + timedelta(days=-1)).strftime('%d')
32 | dbutils.fs.ls(f'abfss://{inventoryBucket}@{storageName}.dfs.core.windows.net/{current_year}/{current_month}/{current_day}/')
33 | print('Using D-1!')
34 | except:
35 | print('No files found!')
36 | dbutils.notebook.exit('No files found in inventory folder!')
37 |
38 | inventoryStoragePath = f'abfss://{inventoryBucket}@{storageName}.dfs.core.windows.net/{current_year}/{current_month}/{current_day}/*/*/*.parquet'
39 |
40 | print('Inventory Storage path: ', inventoryStoragePath)
41 | print('Data Storage path: ', dataStoragePath)
42 | print(f'Inventory Table: {inventoryCatalogName}.{inventoryDabaseName}.{inventoryTableName}')
43 |
44 | # COMMAND ----------
45 |
46 | # DBTITLE 1,Create Inventory Table
47 | spark.sql(f"""
48 | CREATE TABLE IF NOT EXISTS {inventoryCatalogName}.{inventoryDabaseName}.{inventoryTableName}
49 | (
50 | path string,
51 | `creationTime` long,
52 | `modificationTime` long,
53 | `length` long,
54 | `isDir` boolean,
55 | `LastAccessTime` string,
56 | SourceFileName string not null,
57 | datetimeLoad timestamp
58 | );
59 | """
60 | )
61 |
62 | # COMMAND ----------
63 |
64 | # DBTITLE 1,Clean data
65 | # Clean inventory table, we will load new updated data
66 | spark.sql(f"""
67 | Truncate table {inventoryCatalogName}.{inventoryDabaseName}.{inventoryTableName}
68 | """
69 | )
70 |
71 | # COMMAND ----------
72 |
73 | # DBTITLE 1,INSERT INTO inventory table
74 | # Insert new inventory Data to delta table
75 | # Get just the necessary fields
76 | # The hdi_isfolder field is another option to derive the isDir value
77 | spark.sql(f"""
78 | INSERT INTO {inventoryCatalogName}.{inventoryDabaseName}.{inventoryTableName}
79 | select
80 | concat('{dataStoragePath}',replace(name,'{dataBucket}/','')) as path,
81 | `Creation-Time` creationTime,
82 | `Last-Modified` modificationTime,
83 | `Content-Length` as length,
84 | case when `Content-Length` > 0 then false else true end isDir,
85 | cast(from_unixtime(`LastAccessTime` / 1000) as string) LastAccessTime,
86 | _metadata.file_name as SourceFileName,
87 | current_timestamp as datetimeLoad
88 | from
89 | parquet.`{inventoryStoragePath}`
90 | """
91 | ).display()
92 |
93 | # COMMAND ----------
94 |
95 | # MAGIC %md
96 | # MAGIC > Example
97 | # MAGIC >> **vacuum catalog.database.tableName using inventory (select path, length, isDir, modificationTime from dev.datainaction.vacuumInventory) RETAIN 48 HOURS**
98 |
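99 | # COMMAND ----------
100 | 
101 | # DBTITLE 1,Sketch: running VACUUM with the inventory table
102 | # Illustrative sketch (the target table name is an assumption): the inventory query must
103 | # return path, length, isDir and modificationTime, as in the example above.
104 | targetTable = 'dev.datainaction.tbordersliquid'  # hypothetical table to vacuum
105 | spark.sql(f"""
106 |     VACUUM {targetTable}
107 |     USING INVENTORY (select path, length, isDir, modificationTime
108 |                      from {inventoryCatalogName}.{inventoryDabaseName}.{inventoryTableName})
109 |     RETAIN 168 HOURS
110 | """).display()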
--------------------------------------------------------------------------------
/tips/SHOW/SHOW COMMANDs.sql:
--------------------------------------------------------------------------------
1 | -- Databricks notebook source
2 | SHOW CATALOGS
3 |
4 | -- COMMAND ----------
5 |
6 | -- Databases = Schemas
7 | SHOW SCHEMAS FROM DEV
8 |
9 | -- COMMAND ----------
10 |
11 | SHOW TABLES FROM DEV.db_demo
12 |
13 | -- COMMAND ----------
14 |
15 | USE CATALOG DEV;
16 | USE SCHEMA db_demo;
17 | SHOW TABLE EXTENDED LIKE 'tb*';
18 |
19 | -- COMMAND ----------
20 |
21 | DROP TABLE testeuc2
22 |
23 | -- COMMAND ----------
24 |
25 | SHOW TABLES DROPPED IN db_demo
26 |
27 | -- COMMAND ----------
28 |
29 | UNDROP table DEV.db_demo.testeuc2
30 |
31 | -- COMMAND ----------
32 |
33 | ALTER TABLE DEV.db_demo.tbordersliquid SET TBLPROPERTIES ('delta.deletedFileRetentionDuration' = '1 days');
34 |
35 | -- COMMAND ----------
36 |
37 | SHOW TBLPROPERTIES DEV.db_demo.tbordersliquid
38 |
39 | -- COMMAND ----------
40 |
41 | SHOW COLUMNS FROM DEV.db_demo.tbordersliquid
42 |
43 | -- COMMAND ----------
44 |
45 | -- MAGIC %py
46 | -- MAGIC listcolunas = ''
47 | -- MAGIC for c in spark.sql('SHOW COLUMNS FROM DEV.db_demo.tbordersliquid').collect():
48 | -- MAGIC     listcolunas = listcolunas + c.col_name + ','
49 | -- MAGIC print(listcolunas.rstrip(','))
50 |
51 | -- COMMAND ----------
52 |
53 | -- MAGIC %py
54 | -- MAGIC listcolunas = ','.join(str(col.col_name) for col in spark.sql('SHOW COLUMNS FROM DEV.db_demo.tbordersliquid').collect())
55 | -- MAGIC print(listcolunas)
56 |
57 | -- COMMAND ----------
58 |
59 | SHOW CREATE TABLE DEV.db_demo.tbordersliquid
60 |
61 | -- COMMAND ----------
62 |
63 | SHOW PARTITIONS DEV.db_demo.tborderspartition
64 |
65 | -- COMMAND ----------
66 |
67 | SHOW USERS
68 |
69 | -- COMMAND ----------
70 |
71 | SHOW USERS LIKE '*dataside*'
72 |
73 | -- COMMAND ----------
74 |
75 | SHOW GROUPS
76 |
77 | -- COMMAND ----------
78 |
79 | SHOW GROUPS WITH USER `reginaldo.silva@dataside.com.br`;
80 |
81 | -- COMMAND ----------
82 |
83 | SHOW GROUPS WITH GROUP `read_write_prod`;
84 |
85 | -- COMMAND ----------
86 |
87 | USE CATALOG DEV
88 |
89 | -- COMMAND ----------
90 |
91 | -- MAGIC %py
92 | -- MAGIC from pyspark.sql.functions import lit
93 | -- MAGIC from datetime import datetime
94 | -- MAGIC countTBOk = 1
95 | -- MAGIC countError = 0
96 | -- MAGIC countTotal = 1
97 | -- MAGIC for db in spark.sql("show databases").collect():
98 | -- MAGIC print('>>>>>>>> iniciando DB: ',db.databaseName)
99 | -- MAGIC for tb in spark.sql(f"show tables from {db.databaseName}").collect():
100 | -- MAGIC try:
101 | -- MAGIC countTotal = countTotal + 1
102 | -- MAGIC print(countTotal,' - ',db.databaseName,'.',tb.tableName)
103 | -- MAGIC spark.sql(f"select * from {db.databaseName}.{tb.tableName} limit 1")
104 | -- MAGIC countTBOk = countTBOk + 1
105 | -- MAGIC except Exception as error:
106 | -- MAGIC       print("#######error occurred on: ", db.databaseName,'.',tb.tableName, error)
107 | -- MAGIC countError = countError + 1
108 | -- MAGIC print ('------Quantidade de erros:', countError)
109 | -- MAGIC
110 | -- MAGIC print('Tabelas OK: ', countTBOk)
111 | -- MAGIC print('Tabelas com Erro: ', countError)
112 | -- MAGIC print('Total tabelas: ', countTotal)
113 |
114 | -- COMMAND ----------
115 |
116 | -- MAGIC %py
117 | -- MAGIC from pyspark.sql.functions import lit
118 | -- MAGIC from datetime import datetime
119 | -- MAGIC countTotal = 0
120 | -- MAGIC for db in spark.sql("show databases").collect():
121 | -- MAGIC print('>>>>>>>> iniciando DB: ',db.databaseName)
122 | -- MAGIC for tb in spark.sql(f"show tables from {db.databaseName}").collect():
123 | -- MAGIC try:
124 | -- MAGIC countTotal = countTotal + 1
125 | -- MAGIC print(countTotal,' - ',str(db.databaseName).replace(' ',''),'.',str(tb.tableName).replace(' ',''))
126 | -- MAGIC listcolunas = ','.join(str(col.col_name) for col in spark.sql(f"""SHOW COLUMNS FROM {db.databaseName.replace(' ','')}.{tb.tableName} """).collect())
127 | -- MAGIC print('->>> TableName: ',db.databaseName,'.',tb.tableName, ' ->>> List Cols: ',listcolunas)
128 | -- MAGIC except Exception as error:
129 | -- MAGIC       print("#######error occurred on: ", db.databaseName,'.',tb.tableName, error)
130 |
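131 | -- COMMAND ----------
132 | 
133 | -- DBTITLE 1,Sketch: other useful SHOW commands (Unity Catalog)
134 | -- Illustrative additions: grants on a table and the functions visible in a schema
135 | SHOW GRANTS ON TABLE DEV.db_demo.tbordersliquid
136 | 
137 | -- COMMAND ----------
138 | 
139 | SHOW FUNCTIONS IN DEV.db_demo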
--------------------------------------------------------------------------------
/tips/DatabricksServicePrincipal/Generate ServicePrincipal Token.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Criando uma ServicePrincipal e colocando em um grupo especifico
3 | # MAGIC %sh
4 | # MAGIC curl --netrc -X POST \
5 | # MAGIC https://adb-4013955633331914.14.azuredatabricks.net/api/2.0/preview/scim/v2/ServicePrincipals \
6 | # MAGIC --header 'Content-type: application/scim+json' \
7 | # MAGIC --header 'Authorization: Bearer xxx-3' \
8 | # MAGIC --data '{"schemas": ["urn:ietf:params:scim:schemas:core:2.0:ServicePrincipal"],"applicationId": "96a9c13a-bd04-459f-a186-e36fe24b6c9a","displayName": "databricks-serviceprincipal","groups": [{"value": "612835559850353"}],"entitlements": [{ "value": "allow-cluster-create"}], "active": true}'
9 |
10 | # COMMAND ----------
11 |
12 | # DBTITLE 1,Resgatando o GroupID
13 | ##prod
14 | from requests import request
15 | from pyspark.sql.functions import *
16 | import requests
17 | import json
18 |
19 | instance_id = 'adb-4013955633331914.14.azuredatabricks.net'
20 |
21 | api_version = '/api/2.0'
22 | api_command = '/preview/scim/v2/Groups'
23 | url = f"https://{instance_id}{api_version}{api_command}"
24 |
25 | #Adicionar secret
26 | headers = {
27 | 'Authorization': "Bearer xxxx-3"
28 | }
29 |
30 | response = requests.get(
31 | url = url,
32 | headers=headers
33 | )
34 |
35 | jsonDataList = []
36 | jsonDataList.append(json.dumps(json.loads(response.text), indent = 2))
37 | jsonRDD = sc.parallelize(jsonDataList)
38 | dfGroups = spark.read.option('multiline', 'true').option('inferSchema', 'true').json(jsonRDD)
39 | dfExplode = dfGroups.withColumn("Groups",explode(dfGroups.Resources))
40 | dfExplode.select(dfExplode.Groups.id,dfExplode.Groups.displayName).display()
41 |
42 | # COMMAND ----------
43 |
44 | # DBTITLE 1,List ServicePrincipal
45 | # MAGIC %sh
46 | # MAGIC curl -X GET \
47 | # MAGIC https://adb-4013955633331914.14.azuredatabricks.net/api/2.0/preview/scim/v2/ServicePrincipals \
48 | # MAGIC --header "Authorization: Bearer xxxx-3"
49 |
50 | # COMMAND ----------
51 |
52 | # DBTITLE 1,Generate a short-lived token - Use this token to generate a Databricks PAT for an app
53 | # MAGIC %sh
54 | # MAGIC curl -X POST -H 'Content-Type: application/x-www-form-urlencoded' \
55 | # MAGIC https://login.microsoftonline.com/[TenantID]/oauth2/v2.0/token \
56 | # MAGIC -d 'client_id=96a9c13a-bd04-459f-a186-e36fe24b6c9a' \
57 | # MAGIC -d 'grant_type=client_credentials' \
58 | # MAGIC -d 'scope=2ff814a6-3304-4ab8-85cb-cd0e6f879c1d%2F.default' \
59 | # MAGIC -d 'client_secret=[App Secret]'
60 |
61 | # COMMAND ----------
62 |
63 | # DBTITLE 1,Create Token for ServicePrincipal - Use short-live token to authenticate
64 | # MAGIC %sh
65 | # MAGIC curl -X POST \
66 | # MAGIC https://adb-4013955633331914.14.azuredatabricks.net/api/2.0/token/create \
67 | # MAGIC --header "Content-type: application/json" \
68 | # MAGIC --header "Authorization: Bearer [token]" \
69 | # MAGIC --data '{"application_id": "96a9c13a-bd04-459f-a186-e36fe24b6c9a","comment": "Token para acesso no PowerBI, token não expira","lifetime_seconds": -1}'
70 |
71 | # COMMAND ----------
72 |
73 | # DBTITLE 1,Token List - Use App Token
74 | ##prod
75 | from pyspark.sql.functions import *
76 | from requests import request
77 | import requests
78 | import json
79 |
80 | instance_id = 'adb-4013955633331914.14.azuredatabricks.net'
81 |
82 | api_version = '/api/2.0'
83 | api_command = '/token/list'
84 | url = f"https://{instance_id}{api_version}{api_command}"
85 |
86 | #Adicionar secret
87 | headers = {
88 | 'Authorization': "Bearer xxx-3"
89 | }
90 |
91 | response = requests.get(
92 | url = url,
93 | headers=headers
94 | )
95 |
96 | jsonDataList = []
97 | jsonDataList.append(json.dumps(json.loads(response.text), indent = 2))
98 | jsonRDD = sc.parallelize(jsonDataList)
99 | dfGroups = spark.read.option('multiline', 'true').option('inferSchema', 'true').json(jsonRDD)
100 | print(json.dumps(json.loads(response.text), indent = 2))
101 |
102 | # COMMAND ----------
103 |
104 | # DBTITLE 1,Test API request with ServicePrincipal Token
105 | # MAGIC %sh
106 | # MAGIC curl -X GET \
107 | # MAGIC -H 'Authorization: Bearer xxx-3' \
108 | # MAGIC https://adb-4013955633331914.14.azuredatabricks.net/api/2.0/clusters/list
109 |
--------------------------------------------------------------------------------
/tips/logicapp/logicapp.json:
--------------------------------------------------------------------------------
1 | {
2 | "definition": {
3 | "$schema": "https://schema.management.azure.com/providers/Microsoft.Logic/schemas/2016-06-01/workflowdefinition.json#",
4 | "actions": {
5 | "HTTP_-_Run_Databricks_Job": {
6 | "inputs": {
7 | "body": {
8 | "job_id": "@triggerBody()?['jobId']"
9 | },
10 | "headers": {
11 | "Authorization": "Bearer xxxxx"
12 | },
13 | "method": "POST",
14 | "uri": "@triggerBody()?['databricksWorkspace']"
15 | },
16 | "runAfter": {},
17 | "type": "Http"
18 | },
19 | "Send_Email_Notification": {
20 | "inputs": {
21 | "body": {
22 |                                 "Body": "Abaixo detalhes da execução:\n\n@{triggerBody()?['customBody']}\n\nJobName:@{triggerBody()?['jobName']}\nDatabricksAPI:@{triggerBody()?['databricksWorkspace']}\nDate: @{triggerBody()?['dateLog']}\nResult:@{body('HTTP_-_Run_Databricks_Job')}",
23 | "Importance": "High",
24 | "Subject": "@triggerBody()?['subject']",
25 | "To": "@triggerBody()?['emailList']"
26 | },
27 | "host": {
28 | "connection": {
29 | "name": "@parameters('$connections')['office365']['connectionId']"
30 | }
31 | },
32 | "method": "post",
33 | "path": "/v2/Mail"
34 | },
35 | "runAfter": {
36 | "HTTP_-_Run_Databricks_Job": [
37 | "Succeeded"
38 | ]
39 | },
40 | "type": "ApiConnection"
41 | }
42 | },
43 | "contentVersion": "1.0.0.0",
44 | "outputs": {},
45 | "parameters": {
46 | "$connections": {
47 | "defaultValue": {},
48 | "type": "Object"
49 | }
50 | },
51 | "triggers": {
52 | "Events_Monitor_": {
53 | "evaluatedRecurrence": {
54 | "frequency": "Minute",
55 | "interval": 1
56 | },
57 | "inputs": {
58 | "host": {
59 | "connection": {
60 | "name": "@parameters('$connections')['sql_3']['connectionId']"
61 | }
62 | },
63 | "method": "get",
64 | "path": "/v2/datasets/@{encodeURIComponent(encodeURIComponent('default'))},@{encodeURIComponent(encodeURIComponent('default'))}/tables/@{encodeURIComponent(encodeURIComponent('tb_OrchestratorEvents'))}/onnewitems"
65 | },
66 | "recurrence": {
67 | "frequency": "Minute",
68 | "interval": 1
69 | },
70 | "splitOn": "@triggerBody()?['value']",
71 | "type": "ApiConnection"
72 | }
73 | }
74 | },
75 | "parameters": {
76 | "$connections": {
77 | "value": {
78 | "office365": {
79 | "connectionId": "/subscriptions/b71883c3-c463-4eb2-b54a-d7eece44d276/resourceGroups/rgDatabricks/providers/Microsoft.Web/connections/office365-2",
80 | "connectionName": "office365-2",
81 | "id": "/subscriptions/b71883c3-c463-4eb2-b54a-d7eece44d276/providers/Microsoft.Web/locations/eastus/managedApis/office365"
82 | },
83 | "sql_3": {
84 | "connectionId": "/subscriptions/b71883c3-c463-4eb2-b54a-d7eece44d276/resourceGroups/rgDatabricks/providers/Microsoft.Web/connections/sql-10",
85 | "connectionName": "sql-10",
86 | "id": "/subscriptions/b71883c3-c463-4eb2-b54a-d7eece44d276/providers/Microsoft.Web/locations/eastus/managedApis/sql"
87 | }
88 | }
89 | }
90 | }
91 | }
--------------------------------------------------------------------------------
/tips/UpgradeMethods/README.md:
--------------------------------------------------------------------------------
1 | ## Migration table: strategies by table type
2 | 
3 | | Id | HMS type | Location | UC type | Method |
4 | |----|----------|----------------|--------------------|--------------------------|
5 | | 1 | Managed | DBFS Root | Managed/External | CTAS / DEEP CLONE |
6 | | 2 | Managed | DBFS Root | Managed/External | CTAS / DEEP CLONE |
7 | | 3 | Hive SerDe | DBFS Root | Managed/External | CTAS / DEEP CLONE |
8 | | 4 | Managed | Mount | External | SYNC with Convert |
9 | | 5 | Managed | Mount | Managed | CTAS / DEEP CLONE |
10 | | 6 | External | Mount | External | SYNC |
11 | | 7 | External | Mount | Managed | CTAS / DEEP CLONE |
12 | | 8 | Managed | Cloud Storage | External | SYNC with Convert |
13 | | 9 | Managed | Cloud Storage | Managed | CTAS / DEEP CLONE |
14 | | 10 | External | Cloud Storage | External | SYNC |
15 | | 11 | External | Cloud Storage | Managed | CTAS / DEEP CLONE |
16 | 
17 | ## Important note
18 | - **set spark.databricks.sync.command.enableManagedTable=true;**
19 |   - When you use this option, you must not drop the table in HMS, because the data will be deleted from the Storage.
20 |   - If you do need to drop it, use the Scala script to switch it from Managed to External.
21 |   - Another tip: after migrating your tables from HMS to UC, if you do not drop them in HMS, you can use this option to keep anyone from writing to them, especially if they are Managed.
22 | 
23 | ## Managed vs External tables
24 | 
25 | - **Managed tables**:
26 |   - Data and metadata are managed by Unity Catalog.
27 |   - The data is stored in the location defined by Unity Catalog (typically cloud storage).
28 |   - Dropping a managed table also removes the data.
29 |     - In HMS the data is removed immediately.
30 |     - In UC the data is kept for another 30 days.
31 |     - Here you can use UNDROP for up to 7 days.
32 | 
33 | - **External tables**:
34 |   - Only the metadata is managed by Unity Catalog; the data stays in external storage (usually a bucket or another cloud resource).
35 |   - Dropping an external table removes only the metadata; the data remains in the original storage.
36 |   - Lets the data be shared across different systems or applications.
37 | 
38 | ### DBFS Root vs Mount vs Cloud Storage
39 | 
40 | - **DBFS Root**:
41 |   - The Databricks distributed file system (Databricks File System).
42 |   - Should be treated as temporary storage, with possible limitations for long-lived workloads.
43 |   - The data physically lives in a Databricks-managed storage account that you cannot access directly.
44 | 
45 | - **Mount**:
46 |   - A way to access external storage (such as S3 or ADLS) through DBFS as if it were a local directory.
47 |   - The data stays in the external storage but can be accessed inside Databricks via mounted paths.
48 | 
49 | - **Cloud Storage**:
50 |   - Cloud storage (e.g. AWS S3, Azure Data Lake, Google Cloud Storage) where data can be stored and accessed directly.
51 |   - More flexible for large volumes and long-term solutions.
52 | 
53 | ### CTAS, DEEP CLONE and SYNC methods
54 | 
55 | - **CTAS (Create Table As Select)**:
56 |   - Creates a new table from the result of a SQL query.
57 |   - The new table can be created with aggregated or filtered data.
58 |   - Example: `CREATE TABLE new_table AS SELECT * FROM existing_table WHERE condition`.
59 | 
60 | - **DEEP CLONE**:
61 |   - Clones a table's data and metadata (note that the source table's transaction history is not carried over to the clone).
62 |   - Used for fast table copies, useful in backup or migration scenarios.
63 |   - Example: `CREATE TABLE target_table DEEP CLONE source_table` creates a full copy of the source table.
64 | 
65 | - **SYNC**:
66 |   - Synchronizes external tables with Unity Catalog, keeping the catalog in line with changes made directly in storage.
67 |   - Essential to keep the metadata in Unity Catalog consistent with the external storage.
68 |   - Useful when the data can be changed outside Databricks.
69 | 
70 | 
71 | Databricks post:
72 | https://www.databricks.com/blog/migrating-tables-hive-metastore-unity-catalog-metastore#appendix
73 | 
74 | Official notebook:
75 | https://notebooks.databricks.com/notebooks/uc-upgrade-scenario-with-examples-for-blog.dbc?_gl=1*1nrxwtq*_gcl_au*OTUxMzE5NDg3LjE2OTM0NjcxNDM.
76 |
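77 | ## Quick examples (sketch)
78 | 
79 | A minimal sketch of each method, assuming an HMS table `hive_metastore.db_demo.tb_orders` and a UC target `dev.db_demo.tb_orders` (names are illustrative):
80 | 
81 | ```sql
82 | -- SYNC (external table: data stays in place; DRY RUN only validates)
83 | SYNC TABLE dev.db_demo.tb_orders FROM hive_metastore.db_demo.tb_orders DRY RUN;
84 | 
85 | -- CTAS (rewrites the data into the UC location)
86 | CREATE TABLE dev.db_demo.tb_orders AS SELECT * FROM hive_metastore.db_demo.tb_orders;
87 | 
88 | -- DEEP CLONE (copies data and metadata; can be re-run to pick up new files)
89 | CREATE OR REPLACE TABLE dev.db_demo.tb_orders DEEP CLONE hive_metastore.db_demo.tb_orders;
90 | ```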
--------------------------------------------------------------------------------
/API/databricks/Databricks API - Clusters.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Listando todos os clusters Databricks
3 | import requests
4 | import json
5 |
6 | ## O seu databricks Instance voce encontra na sua URL.
7 | instance_id = 'xxxxxxx.azuredatabricks.net'
8 |
9 | # Aqui estamos usando a API na versão 2.0 que é a mais recente no momento
10 | api_version = '/api/2.0'
11 | api_command = '/clusters/list'
12 | url = f"https://{instance_id}{api_version}{api_command}"
13 | print(url)
14 |
15 | headers = {
16 | 'Authorization': "Bearer xxxxxxx" ## put your databricks token here
17 | }
18 |
19 | response = requests.get(
20 | url = url,
21 | headers=headers
22 | )
23 |
24 | # Transformando nosso retorno em um Dataframe
25 | jsonDataList = []
26 | jsonDataList.append(json.dumps(json.loads(response.text), indent = 2))
27 | jsonRDD = sc.parallelize(jsonDataList)
28 | df = spark.read.option('multiline', 'true').option('inferSchema', 'true').json(jsonRDD)
29 |
30 | # COMMAND ----------
31 |
32 | # DBTITLE 1,Print em modo Text
33 | print(response.text)
34 |
35 | # COMMAND ----------
36 |
37 | # DBTITLE 1,Print bonito
38 | print(json.dumps(json.loads(response.text), indent = 2))
39 |
40 | # COMMAND ----------
41 |
42 | # DBTITLE 1,Expandindo itens
43 | from pyspark.sql.functions import *
44 | # Usando o Explode para expandir nosso Json.
45 | dfclusters = df.select(explode("clusters").alias("cl"))
46 | dfclusters.display()
47 |
48 | # COMMAND ----------
49 |
50 | # DBTITLE 1,Selecionando os campos relevantes
51 | dfclusters.select(
52 | dfclusters.cl.cluster_id,
53 | dfclusters.cl.cluster_name,
54 | dfclusters.cl.cluster_cores,
55 | dfclusters.cl.cluster_memory_mb,
56 | dfclusters.cl.state,
57 | dfclusters.cl.spark_conf,
58 | dfclusters.cl.cluster_source.alias("cluster_source"),
59 | dfclusters.cl.creator_user_name,
60 | dfclusters.cl.autotermination_minutes,
61 | dfclusters.cl.azure_attributes,
62 | dfclusters.cl.autoscale,
63 | dfclusters.cl.custom_tags,
64 | dfclusters.cl.default_tags,
65 | dfclusters.cl.driver,
66 | dfclusters.cl.driver_instance_source,
67 | dfclusters.cl.driver_node_type_id,
68 | dfclusters.cl.node_type_id,
69 | dfclusters.cl.effective_spark_version.alias("effective_spark_version"),
70 | dfclusters.cl.enable_elastic_disk,
71 | dfclusters.cl.last_restarted_time,
72 | dfclusters.cl.last_state_loss_time,
73 | dfclusters.cl.num_workers,
74 | dfclusters.cl.runtime_engine.alias("runtime_engine"),
75 | dfclusters.cl.spark_conf,
76 | dfclusters.cl.start_time,
77 | dfclusters.cl.state,
78 | dfclusters.cl.state_message,
79 | dfclusters.cl.terminated_time,
80 | dfclusters.cl.termination_reason
81 | ).createOrReplaceTempView('vw_clusters')
82 |
83 | # COMMAND ----------
84 |
85 | # DBTITLE 1,Consultando com SQL
86 | # MAGIC %sql
87 | # MAGIC -- Para os amantes de SQL
88 | # MAGIC select * from vw_clusters
89 |
90 | # COMMAND ----------
91 |
92 | # DBTITLE 1,Agrupando por versão e origem
93 | # MAGIC %sql
94 | # MAGIC select cluster_source,effective_spark_version,count(*) as qtdClusters from vw_clusters
95 | # MAGIC group by cluster_source,effective_spark_version
96 | # MAGIC order by cluster_source,effective_spark_version
97 |
98 | # COMMAND ----------
99 |
100 | # DBTITLE 1,Criando um novo cluster via API
101 | # Aqui estamos usando a API na versão 2.0 que é a mais recente no momento
102 | api_version = '/api/2.0'
103 | api_command = '/clusters/create'
104 | url = f"https://{instance_id}{api_version}{api_command}"
105 | print(url)
106 |
107 | headers = {
108 | 'Authorization': "Bearer xxxxxx-2" ## put your databricks token here
109 | }
110 |
111 | datajson = {
112 | "cluster_name": "my-cluster-api",
113 | "spark_version": "11.3.x-scala2.12",
114 | "node_type_id": "Standard_D3_v2",
115 | "spark_conf": {
116 | "spark.speculation": True
117 | },
118 | "num_workers": 1
119 | }
120 |
121 | print(json.dumps(datajson, indent = 2))
122 | data = json.dumps(datajson, indent = 2)
123 | response = requests.post(url = url, headers = headers, data = data)
124 | print(response)
125 |
126 | # COMMAND ----------
127 |
128 | # DBTITLE 1,Deletando um cluster via API
129 | # Aqui estamos usando a API na versão 2.0 que é a mais recente no momento
130 | api_version = '/api/2.0'
131 | api_command = '/clusters/delete'
132 | url = f"https://{instance_id}{api_version}{api_command}"
133 | print(url)
134 |
135 | headers = {
136 | 'Authorization': "Bearer xxxxxx" ## put your databricks token here
137 | }
138 |
139 | datajson = {"cluster_id": "0211-131904-kvyksq3e"}
140 |
141 | print(json.dumps(datajson, indent = 2))
142 | data = json.dumps(datajson, indent = 2)
143 | response = requests.post(url = url, headers = headers, data = data)
144 | print(response)
145 |
146 | # COMMAND ----------
147 |
148 | # DBTITLE 1,Salvando como tabela Delta
149 | ## Adicione o caminho do storage
150 | caminhoDatalakeLog = '[]'
151 | df = spark.sql("select * from vw_clusters")
152 | df.write.option("mergeSchema", "true").mode(f"overwrite").format("delta").save(f"{caminhoDatalakeLog}")
153 | spark.sql(f"Create Table if not exists [nome do seu banco de dados].monitoramento_clusters Using Delta Location '{caminhoDatalakeLog}'")
154 |
--------------------------------------------------------------------------------
/routines/OptimizeAndVacuum/OptimizeAndVacuum.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC Descrição dos parâmetros
4 | # MAGIC
5 | # MAGIC | Parâmetro | Descrição | Tipo |
6 | # MAGIC | ------------- | ------------- | ------------- |
7 | # MAGIC | nomeSchema | Nome do Database onde a tabela está criada | string |
8 | # MAGIC | nomeTabela | Nome da tabela em que será aplicada a manutenção | string |
9 | # MAGIC | vacuum | True: Vacuum será executado, False: Pula vacuum | bool |
10 | # MAGIC | optimize | True: OPTIMIZE será executado, False: Pula OPTIMIZE | bool |
11 | # MAGIC | colunasZorder | Se informado e optimize for igual a True, aplica ZORDER na lista de colunas separada por vírgula (,) | string |
12 | # MAGIC | vacuumRetention | Quantidade de horas que serão retidas após a execução do Vacuum | integer |
13 | # MAGIC | Debug | Apenas imprime o resultado na tela | bool |
14 | # MAGIC
15 | # MAGIC Exemplos:
16 | # MAGIC
17 | # MAGIC #### --> Primeiro instanciar a Function <--
18 | # MAGIC `` %run /Users/reginaldo.silva@dataside.com.br/OptimizeAndVacuum ``
19 | # MAGIC
20 | # MAGIC #### --> Executando VACUUM com retenção de 72 horas e OPTIMIZE SEM ZORDER <--
21 | # MAGIC ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='funcionario', colunasZorder='none', vacuumRetention=72, vacuum=True, optimize=True, debug=False)``
22 | # MAGIC
23 | # MAGIC #### --> Executando VACUUM com retenção padrão e OPTIMIZE COM ZORDER <--
24 | # MAGIC ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='patient_id', vacuumRetention=168, vacuum=True, optimize=True, debug=False)``
25 | # MAGIC
26 | # MAGIC #### --> Executando somente VACUUM <--
27 | # MAGIC ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=False, debug=False)``
28 | # MAGIC
29 | # MAGIC #### --> Executando somente OPTIMIZE <--
30 | # MAGIC ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=False, optimize=True, debug=False)``
31 | # MAGIC
32 | # MAGIC #### --> Modo Debug - Apenas print <--
33 | # MAGIC ``maintenanceDeltalake(nomeSchema='db_festivaldemo', nomeTabela='PatientInfoDelta', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=True, debug=True)``
34 | # MAGIC
35 | # MAGIC ``Criado por: Reginaldo Silva``
36 | # MAGIC - [Blog Data In Action](https://datainaction.dev/)
37 | # MAGIC - [Github](https://github.com/reginaldosilva27)
38 | # MAGIC
39 | # MAGIC ``Referencias:``
40 | # MAGIC -
41 | # MAGIC -
42 |
43 | # COMMAND ----------
44 |
45 | from datetime import datetime
46 | def maintenanceDeltalake (nomeSchema='silver', nomeTabela='none', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=True, debug=True):
47 | if debug:
48 | print("Modo Debug habilitado!")
49 | if optimize:
50 | if colunasZorder != "none":
51 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} com ZORDER no grupo de colunas: {colunasZorder} <<< >>> {str(datetime.now())}")
52 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})")
53 | else:
54 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} sem ZORDER <<< >>> {str(datetime.now())}")
55 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela}")
56 | print(f">>> Tabela {nomeSchema}.{nomeTabela} otimizada! <<< >>> {str(datetime.now())}")
57 | else:
58 | print(f"### Não executado OPTIMIZE! ###")
59 |
60 | if vacuum:
61 | print(f">>> Setando {vacuumRetention} horas para limpeza de versionamento do deltalake... <<< >>> {str(datetime.now())}")
62 | print(f"CMD: VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours")
63 | print(f">>> Limpeza da tabela {nomeSchema}.{nomeTabela} aplicada com sucesso! <<< >>> {str(datetime.now())}")
64 | else:
65 | print(f"### Não executado VACUUM! ###")
66 | else:
67 | print("Modo Debug desabilitado!")
68 | if optimize:
69 | if colunasZorder != "none":
70 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} com ZORDER no grupo de colunas: {colunasZorder} <<< >>> {str(datetime.now())}")
71 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})")
72 | spark.sql(f"OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})")
73 | else:
74 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} sem ZORDER <<< >>> {str(datetime.now())}")
75 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela}")
76 | spark.sql(f"OPTIMIZE {nomeSchema}.{nomeTabela}")
77 | print(f">>> Tabela {nomeSchema}.{nomeTabela} otimizada! <<< >>> {str(datetime.now())}")
78 | else:
79 | print(f"### Não executado OPTIMIZE! ###")
80 |
81 | if vacuum:
82 | print(f">>> Setando {vacuumRetention} horas para limpeza de versionamento do deltalake... <<< >>> {str(datetime.now())}")
83 | spark.sql("set spark.databricks.delta.retentionDurationCheck.enabled = false")
84 | print(f"CMD: VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours")
85 | spark.sql(f"VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours")
86 | spark.sql("set spark.databricks.delta.retentionDurationCheck.enabled = true")
87 | print(f">>> Limpeza da tabela {nomeSchema}.{nomeTabela} aplicada com sucesso! <<< >>> {str(datetime.now())}")
88 | else:
89 | print(f"### Não executado VACUUM! ###")
90 |
91 | # COMMAND ----------
92 |
93 | # DBTITLE 1,Enviar parâmetros para execução após instanciar a função
94 | # MAGIC %py
95 | # MAGIC #Caso queira já chamar a função diretamente do Azure Data Factory, informar os parametros na chamada do notebook
96 | # MAGIC try:
97 | # MAGIC maintenanceDeltalake(nomeSchema=getArgument("NomeSchema"), nomeTabela=getArgument("NomeTabela"), colunasZorder=getArgument("ColunasZorder"), vacuumRetention=getArgument("VacuumRetention"), vacuum=eval(getArgument("Vacuum")), optimize=eval(getArgument("Optimize")), debug=eval(getArgument("Debug")))
98 | # MAGIC except:
99 | # MAGIC print("Função maintenanceDeltalake() instanciada no contexto!")
100 |
--------------------------------------------------------------------------------
/tips/timeTravelVsCDF/Time Travel vs Change Data Feed.sql:
--------------------------------------------------------------------------------
1 | -- Databricks notebook source
2 | -- DBTITLE 1,Criando tabela demo
3 | -- MAGIC %py
4 | -- MAGIC df = spark.read.option("header", "True").format('csv').load('/databricks-datasets/COVID/coronavirusdataset/PatientInfo.csv')
5 | -- MAGIC df.write.format('delta').mode('overwrite').saveAsTable("db_demo.PatientInfoDelta",path='abfss://reginaldo@stdts360.dfs.core.windows.net/bronze/PatientInfoDelta')
6 | -- MAGIC df.display()
7 |
8 | -- COMMAND ----------
9 |
10 | -- DBTITLE 1,Time travel exemplo
11 | -- Atualizando 1 registro
12 | update db_demo.PatientInfoDelta set age = '33s' where patient_id = '1000000001'
13 |
14 | -- COMMAND ----------
15 |
16 | -- DBTITLE 1,Visualizando o histórico de alterações na tabela
17 | describe history db_demo.PatientInfoDelta
18 |
19 | -- COMMAND ----------
20 |
21 | -- DBTITLE 1,Viajando no tempo usando VERSION AS OF
22 | select * from db_demo.PatientInfoDelta VERSION AS OF 0 where patient_id = '1000000001'
23 |
24 | -- COMMAND ----------
25 |
26 | -- DBTITLE 1,Viajando no tempo usando TIMESTAMP AS OF
27 | select 'OLD', * from db_demo.PatientInfoDelta timestamp AS OF '2023-06-03T14:19:07.000+0000'
28 | where patient_id = '1000000001' union all
29 | select 'NEW', * from db_demo.PatientInfoDelta where patient_id = '1000000001'
30 |
31 | -- COMMAND ----------
32 |
33 | -- DBTITLE 1,DELETE Sem Where - E agora quem poderá nos defender?
34 | delete from db_demo.PatientInfoDelta;
35 | select * from db_demo.PatientInfoDelta
36 |
37 | -- COMMAND ----------
38 |
39 | -- DBTITLE 1,Historico de alterações
40 | describe history db_demo.PatientInfoDelta
41 |
42 | -- COMMAND ----------
43 |
44 | -- DBTITLE 1,Restaurando a tabela com historico do TIME TRAVEL
45 | RESTORE db_demo.PatientInfoDelta VERSION AS OF 1;
46 | select * from db_demo.PatientInfoDelta
47 |
48 | -- COMMAND ----------
49 |
50 | -- DBTITLE 1,Habilitando o Change Data Feed
51 | Alter table db_demo.PatientInfoDelta SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
52 |
53 | -- COMMAND ----------
54 |
55 | -- DBTITLE 1,Criar nossa tabela Silver para simular o CDF na prática
56 | -- Essa silver só terá dados de pacientes infectados por outros pacientes filtrando pelo infected_by
57 | Create or Replace table db_demo.SilverPatientInfectedBy
58 | as
59 | select patient_id,sex,age,country,province,city,infection_case,infected_by from db_demo.PatientInfoDelta where infected_by is not null;
60 |
61 | select * from db_demo.SilverPatientInfectedBy;
62 |
63 | -- COMMAND ----------
64 |
65 | -- DBTITLE 1,Gerando algumas modificações
66 | -- Aplicando 2 updates, deletando 1 registro e inserindo 1 registro
67 | -- Note que estou aplicando 2 updates no mesmo registro 1000000003
68 | update db_demo.PatientInfoDelta set age = '70s' where patient_id = '1000000003';
69 | update db_demo.PatientInfoDelta set sex = 'female' where patient_id = '1000000003';
70 | delete from db_demo.PatientInfoDelta where patient_id = '1000000005';
71 | insert into db_demo.PatientInfoDelta values('1000003211','male','31s','Brazil','Sao Paulo','Boituva','Dataholic','1500000033',12,current_date(),current_date(),current_date(),null,'released');
72 |
73 | -- COMMAND ----------
74 |
75 | -- DBTITLE 1,Visualizando as versões
76 | describe history db_demo.PatientInfoDelta
77 |
78 | -- COMMAND ----------
79 |
80 | -- DBTITLE 1,Usando table_changes() para navegar nas versões
81 | -- Pegando a partir da versão 4 tudo que aconteceu
82 | SELECT _change_type,_commit_version,_commit_timestamp,* FROM table_changes('db_demo.`PatientInfoDelta`', 4)
83 |
84 | -- COMMAND ----------
85 |
86 | -- DBTITLE 1,Criando uma View temporaria para pegar somente a ultima versão de cada registro
87 | -- Para Updates pegamos somente o update_postimage que são os dados novos
88 | -- Estamos aplicando a função ROW_NUMBER pela chave da tabela (patient_id) ordenando pelo _commit_version
89 | -- Note que o rnb filtramos apenas o 1, então se o paciente tiver 2 Updates será aplicado o mais recente
90 | -- Estou passando a versão fixa no table_changes, mas pode ser dinamico
91 | CREATE OR REPLACE TEMPORARY VIEW vwPatientInfectedBy as
92 | SELECT *
93 | FROM
94 | (SELECT *, row_number() over (partition by patient_id order by _commit_version desc) as rnb
95 | FROM table_changes('db_demo.`PatientInfoDelta`', 4) where _change_type !='update_preimage' and infected_by is not null)
96 | WHERE rnb=1;
97 |
98 | select _change_type,_commit_version,_commit_timestamp,rnb,* from vwPatientInfectedBy;
99 |
100 | -- COMMAND ----------
101 |
102 | -- DBTITLE 1,Visualizando alterações antes
103 | select * from db_demo.SilverPatientInfectedBy where patient_id in('1000000003','1000000005','1000003211');
104 |
105 | -- COMMAND ----------
106 |
107 | -- DBTITLE 1,Aplicando as alterações na nossa tabela Silver
108 | MERGE INTO db_demo.SilverPatientInfectedBy as t
109 | USING vwPatientInfectedBy as s
110 | ON s.patient_id = t.patient_id
111 | WHEN MATCHED AND s._change_type = 'delete' THEN DELETE
112 | WHEN MATCHED AND s._change_type = 'update_postimage' THEN UPDATE SET *
113 | WHEN NOT MATCHED AND _change_type != 'delete' THEN INSERT *
114 |
115 | -- COMMAND ----------
116 |
117 | -- DBTITLE 1,Visualizando alterações depois
118 | select * from db_demo.SilverPatientInfectedBy where patient_id in('1000000003','1000000005','1000003211');
119 |
120 | -- COMMAND ----------
121 |
122 | -- DBTITLE 1,Sem CDC daria pra fazer?
123 | -- Somente para INSERT e UPDATE, o DELETE não é replicável, a não ser que você compare as tabelas inteiras, o que na maioria dos casos não é viável, pois são cargas incrementais
124 | -- Note a quantidade de escrita, praticamente a tabela Silver inteira foi reescrita
125 | MERGE INTO db_demo.SilverPatientInfectedBy as t
126 | USING db_demo.PatientInfoDelta as s
127 | ON s.patient_id = t.patient_id and s.infected_by is not null
128 | WHEN MATCHED
129 | THEN UPDATE SET *
130 | WHEN NOT MATCHED and s.infected_by is not null
131 | THEN INSERT *
132 |
133 | -- COMMAND ----------
134 |
135 | -- DBTITLE 1,Restaurar um DELETE\UPDATE sem WHERE com CDF?
136 | UPDATE db_demo.PatientInfoDelta set age = '10s';
137 | select * from db_demo.PatientInfoDelta
138 |
139 | -- COMMAND ----------
140 |
141 | describe history db_demo.PatientInfoDelta
142 |
143 | -- COMMAND ----------
144 |
145 | -- DBTITLE 1,Olhando somente a versão 9
146 | SELECT _change_type,_commit_version,_commit_timestamp,* FROM table_changes('db_demo.`PatientInfoDelta`', 9,9)
147 | where _change_type = 'update_preimage'
148 |
149 | -- COMMAND ----------
150 |
151 | -- DBTITLE 1,Voltando um Update sem WHERE com CDF
152 | -- Voltando todos os UPDATES da versão 9
153 | MERGE INTO db_demo.PatientInfoDelta as t
154 | USING (SELECT row_number() over (partition by patient_id order by _commit_version desc) as rnb,*
155 | FROM table_changes('db_demo.`PatientInfoDelta`', 9,9) where _change_type = 'update_preimage') as s
156 | ON s.patient_id = t.patient_id and _change_type = 'update_preimage' and rnb = 1
157 | WHEN MATCHED
158 | THEN UPDATE SET *
159 |
160 | -- COMMAND ----------
161 |
162 | -- DBTITLE 1,É, funciona também
163 | select * from db_demo.PatientInfoDelta;
164 |
--------------------------------------------------------------------------------
/tips/markdown/OptimizeAndVacuum_Doc.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Exemplo de markdown com imagem
3 | # MAGIC %md
4 | # MAGIC | **Coluna1** | **Coluna2** | **Coluna3** |
5 | # MAGIC | --------------- | ------------------| ----------- |
6 | # MAGIC | `linha1` | Desc1. | `str` |
7 | # MAGIC | `linha2` | Desc2. | `str` |
8 | # MAGIC | `linha3` | Desc3. | `str` |
9 | # MAGIC
10 | # MAGIC 
11 |
12 | # COMMAND ----------
13 |
14 | # MAGIC %md
15 | # MAGIC # `maintenanceDeltalake`
16 | # MAGIC
17 | # MAGIC A função `maintenanceDeltalake` é utilizada para executar operações de manutenção em uma tabela Delta Lake. Ela oferece a opção de otimizar a tabela e executar a limpeza de versionamento.
18 | # MAGIC
19 | # MAGIC ## Parâmetros
20 | # MAGIC
21 | # MAGIC | **Nome** | **Descrição** | **Tipo** |
22 | # MAGIC | ------------------- | ------------------------------------------------------------------------------------- | ----------- |
23 | # MAGIC | `nomeSchema` | O nome do esquema (schema) da tabela Delta Lake. | `str` |
24 | # MAGIC | `nomeTabela` | O nome da tabela Delta Lake. | `str` |
25 | # MAGIC | `colunasZorder` | O grupo de colunas para aplicar o ZORDER. | `str` |
26 | # MAGIC | `vacuumRetention` | O tempo de retenção em horas para a limpeza de versionamento. | `int` |
27 | # MAGIC | `vacuum` | Indica se a limpeza de versionamento deve ser executada. | `bool` |
28 | # MAGIC | `optimize` | Indica se a otimização da tabela deve ser executada. | `bool` |
29 | # MAGIC | `debug` | Indica se o modo de depuração está habilitado. | `bool` |
30 | # MAGIC
31 | # MAGIC ## Exemplo de Uso
32 | # MAGIC
33 | # MAGIC Aqui estão três exemplos de chamadas da função `maintenanceDeltalake` com diferentes parâmetros:
34 | # MAGIC
35 | # MAGIC 1. Exemplo com otimização e limpeza habilitadas:
36 | # MAGIC ```python
37 | # MAGIC maintenanceDeltalake(nomeSchema='silver', nomeTabela='my_table', colunasZorder='column1, column2', vacuumRetention=72, vacuum=True, optimize=True, debug=True)
38 | # MAGIC ```
39 | # MAGIC 2. Exemplo sem otimização e com limpeza desabilitada:
40 | # MAGIC ```python
41 | # MAGIC maintenanceDeltalake(nomeSchema='silver', nomeTabela='my_table', colunasZorder='none', vacuumRetention=168, vacuum=False, optimize=False, debug=True)
42 | # MAGIC ```
43 | # MAGIC 3. Exemplo sem otimização e com limpeza habilitada, em modo de produção:
44 | # MAGIC ```python
45 | # MAGIC maintenanceDeltalake(nomeSchema='silver', nomeTabela='my_table', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=False, debug=False)
46 | # MAGIC ```
47 | # MAGIC
48 | # MAGIC >Observação: Lembre-se de fornecer os valores corretos para os parâmetros, com base nas suas necessidades específicas.
49 | # MAGIC
50 | # MAGIC Referência
51 | # MAGIC Para obter mais informações sobre como otimizar seu Delta Lake e reduzir os custos de storage e computação no Databricks, confira o seguinte post: Otimize seu Delta Lake e reduza custos de storage, Databricks e computação.
52 |
53 | # COMMAND ----------
54 |
55 | from datetime import datetime
56 | def maintenanceDeltalake (nomeSchema='silver', nomeTabela='none', colunasZorder='none', vacuumRetention=168, vacuum=True, optimize=True, debug=True):
57 | if debug:
58 | print("Modo Debug habilitado!")
59 | if optimize:
60 | if colunasZorder != "none":
61 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} com ZORDER no grupo de colunas: {colunasZorder} <<< >>> {str(datetime.now())}")
62 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})")
63 | else:
64 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} sem ZORDER <<< >>> {str(datetime.now())}")
65 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela}")
66 | print(f">>> Tabela {nomeSchema}.{nomeTabela} otimizada! <<< >>> {str(datetime.now())}")
67 | else:
68 | print(f"### Não executado OPTIMIZE! ###")
69 |
70 | if vacuum:
71 | print(f">>> Setando {vacuumRetention} horas para limpeza de versionamento do deltalake... <<< >>> {str(datetime.now())}")
72 | print(f"CMD: VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours")
73 | print(f">>> Limpeza da tabela {nomeSchema}.{nomeTabela} aplicada com sucesso! <<< >>> {str(datetime.now())}")
74 | else:
75 | print(f"### Não executado VACUUM! ###")
76 | else:
77 | print("Modo Debug desabilitado!")
78 | if optimize:
79 | if colunasZorder != "none":
80 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} com ZORDER no grupo de colunas: {colunasZorder} <<< >>> {str(datetime.now())}")
81 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})")
82 | spark.sql(f"OPTIMIZE {nomeSchema}.{nomeTabela} ZORDER BY ({colunasZorder})")
83 | else:
84 | print(f">>> Otimizando tabela {nomeSchema}.{nomeTabela} sem ZORDER <<< >>> {str(datetime.now())}")
85 | print(f"CMD: OPTIMIZE {nomeSchema}.{nomeTabela}")
86 | spark.sql(f"OPTIMIZE {nomeSchema}.{nomeTabela}")
87 | print(f">>> Tabela {nomeSchema}.{nomeTabela} otimizada! <<< >>> {str(datetime.now())}")
88 | else:
89 | print(f"### Não executado OPTIMIZE! ###")
90 |
91 | if vacuum:
92 | print(f">>> Setando {vacuumRetention} horas para limpeza de versionamento do deltalake... <<< >>> {str(datetime.now())}")
93 | spark.sql("set spark.databricks.delta.retentionDurationCheck.enabled = false")
94 | print(f"CMD: VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours")
95 | spark.sql(f"VACUUM {nomeSchema}.{nomeTabela} RETAIN {vacuumRetention} Hours")
96 | spark.sql("set spark.databricks.delta.retentionDurationCheck.enabled = true")
97 | print(f">>> Limpeza da tabela {nomeSchema}.{nomeTabela} aplicada com sucesso! <<< >>> {str(datetime.now())}")
98 | else:
99 | print(f"### Não executado VACUUM! ###")
100 |
101 | # COMMAND ----------
102 |
103 | # MAGIC %md
104 | # MAGIC Enviar parâmetros para execução após instanciar a função
105 |
106 | # COMMAND ----------
107 |
108 | # DBTITLE 1,Enviar parâmetros para execução após instanciar a função
109 | # MAGIC %py
110 | # MAGIC #Caso queira já chamar a função diretamente do Azure Data Factory, informar os parametros na chamada do notebook
111 | # MAGIC try:
112 | # MAGIC maintenanceDeltalake(nomeSchema=getArgument("NomeSchema"), nomeTabela=getArgument("NomeTabela"), colunasZorder=getArgument("ColunasZorder"), vacuumRetention=getArgument("VacuumRetention"), vacuum=eval(getArgument("Vacuum")), optimize=eval(getArgument("Optimize")), debug=eval(getArgument("Debug")))
113 | # MAGIC except:
114 | # MAGIC print("Função maintenanceDeltalake() instanciada no contexto!")
115 |
--------------------------------------------------------------------------------
/tips/EXPLODE_STRING/Explode usando SQL.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Listando Grupos de logins no Databricks via API
3 | import requests
4 | import json
5 |
6 | # Define a URL base da API do Databricks
7 | instance_id = 'adb-47319640954053.13.azuredatabricks.net'
8 |
9 | api_version = '/api/2.0'
10 | api_command = '/preview/scim/v2/Groups'
11 | url_list = f"https://{instance_id}{api_version}{api_command}"
12 | url_list_members = f"https://{instance_id}/api/2.0/preview/scim/v2/Groups/"
13 |
14 | print(url_list)
15 |
16 | # Define o cabeçalho com o token de autenticação do Databricks
17 | headers = {
18 | 'Authorization': "Bearer xxx-3",
19 | "Content-Type": "application/json"
20 | }
21 |
22 | has_more = True
23 | count = 0
24 | offset = 0
25 | jsonGroups = []
26 | while has_more:
27 | params = {
28 | 'expand_tasks': 'true',
29 | 'offset': offset
30 | }
31 | try:
32 | print('Listando grupos')
33 | responseList = requests.get(
34 | url = url_list,
35 | params = params,
36 | headers= headers
37 | )
38 | except Exception as error:
39 | print(error)
40 |
41 | jsonGroups.append(responseList.json())
42 | try:
43 | has_more = json.loads(responseList.text)['has_more']
44 | except:
45 | has_more = False
46 |
47 | count = count + 1
48 | offset = offset + 20
49 |
50 | print(jsonGroups)
51 |
52 | # COMMAND ----------
53 |
54 | # DBTITLE 1,Transformando a lista em Dataframe
55 | jsonRDD = sc.parallelize(jsonGroups)
56 | df = spark.read.option('multiline', 'true').option('inferSchema', 'true').json(jsonRDD)
57 | df.createOrReplaceTempView('vw_sql_temp')
58 |
59 | # COMMAND ----------
60 |
61 | # DBTITLE 1,Criando o mesmo Dataframe usando tudo como String
62 | jsonValues = []
63 | jsonValues.append({'totalResults': 7, 'startIndex': 1, 'itemsPerPage': 7, 'schemas': ["urn:ietf:params:scim:api:messages: 2.0:ListResponse"], 'Resources': "[{'displayName': 'read_write_prod', 'entitlements': null, 'groups': [{'$ref': 'Groups/769409655224333', 'display': 'read_dev', 'type': 'direct', 'value': '769409655224333'}], 'id': '67674397758141', 'members': [{'$ref': 'Users/8675301566931963', 'display': 'reginaldo.silva@dataside.com.br', 'value': '8675301566931963'}, {'$ref': 'ServicePrincipals/2955041608089028', 'display': 'databricks-serviceprincipal', 'value': '2955041608089028'}, {'$ref': 'Users/547673826594172', 'display': 'Reginaldo Silva', 'value': '547673826594172'}, {'$ref': 'Users/1581289963735043', 'display': 'Regis Naldo', 'value': '1581289963735043'}, {'$ref': 'Users/1948297106640748', 'display': 'Reginaldo Silva', 'value': '1948297106640748'}], 'meta': {'resourceType': 'Group'}}, {'displayName': 'read_prod', 'entitlements': null, 'groups': [{'$ref': 'Groups/766964445608499', 'display': 'read_write_dev', 'type': 'direct', 'value': '766964445608499'}], 'id': '138152945819756', 'members': [{'$ref': 'Users/8675301566931963', 'display': 'reginaldo.silva@dataside.com.br', 'value': '8675301566931963'}], 'meta': {'resourceType': 'Group'}}, {'displayName': 'users', 'entitlements': [{'value': 'workspace-access'}, {'value': 'databricks-sql-access'}], 'groups': [], 'id': '371637887295750', 'members': [{'$ref': 'ServicePrincipals/2955041608089028', 'display': 'databricks-serviceprincipal', 'value': '2955041608089028'}, {'$ref': 'Users/8675301566931963', 'display': 'reginaldo.silva@dataside.com.br', 'value': '8675301566931963'}, {'$ref': 'Users/547673826594172', 'display': 'Reginaldo Silva', 'value': '547673826594172'}, {'$ref': 'Users/1581289963735043', 'display': 'Regis Naldo', 'value': '1581289963735043'}, {'$ref': 'Users/1948297106640748', 'display': 'Reginaldo Silva', 'value': '1948297106640748'}], 'meta': {'resourceType': 'WorkspaceGroup'}}, {'displayName': 'read_write_dev', 'entitlements': [{'value': 'databricks-sql-access'}], 'groups': [], 'id': '766964445608499', 'members': [{'$ref': 'Users/8675301566931963', 'display': 'reginaldo.silva@dataside.com.br', 'value': '8675301566931963'}, {'$ref': 'Users/1948297106640748', 'display': 'Reginaldo Silva', 'value': '1948297106640748'}, {'$ref': 'Groups/138152945819756', 'display': 'read_prod', 'value': '138152945819756'}], 'meta': {'resourceType': 'Group'}}, {'displayName': 'read_dev', 'entitlements': null, 'groups': [], 'id': '769409655224333', 'members': [{'$ref': 'Groups/67674397758141', 'display': 'read_write_prod', 'value': '67674397758141'}], 'meta': {'resourceType': 'Group'}}, {'displayName': 'admins', 'entitlements': [{'value': 'workspace-access'}, {'value': 'databricks-sql-access'}, {'value': 'allow-cluster-create'}, {'value': 'allow-instance-pool-create'}], 'groups': [], 'id': '868174163364744', 'members': [{'$ref': 'Users/547673826594172', 'display': 'Reginaldo Silva', 'value': '547673826594172'}, {'$ref': 'Users/1948297106640748', 'display': 'Reginaldo Silva', 'value': '1948297106640748'}], 'meta': {'resourceType': 'WorkspaceGroup'}}, {'displayName': 'demogroup', 'entitlements': null, 'groups': [], 'id': '1053327257318900', 'members': [{'$ref': 'Users/547673826594172', 'display': 'Reginaldo Silva', 'value': '547673826594172'}, {'$ref': 'Users/1581289963735043', 'display': 'Regis Naldo', 'value': '1581289963735043'}, {'$ref': 'Users/8675301566931963', 'display': 'reginaldo.silva@dataside.com.br', 'value': 
'8675301566931963'}, {'$ref': 'Users/1948297106640748', 'display': 'Reginaldo Silva', 'value': '1948297106640748'}], 'meta': {'resourceType': 'Group'}}]" })
64 |
65 | from pyspark.sql.types import *
66 | schema = StructType([
67 | StructField('Resources', StringType(), True),
68 | StructField('itemsPerPage', StringType(), True),
69 | StructField('schemas', StringType(), True),
70 | StructField('startIndex', StringType(), True),
71 | StructField('totalResults', StringType(), True)
72 | ])
73 |
74 | dfString = spark.createDataFrame(jsonValues,schema)
75 | dfString.createOrReplaceTempView('vw_sql_temp_string')
76 | dfString.printSchema()
77 |
78 | # COMMAND ----------
79 |
80 | # DBTITLE 1,Dataframe tipado Array e Struct
81 | # MAGIC %sql
82 | # MAGIC select * from vw_sql_temp
83 |
84 | # COMMAND ----------
85 |
86 | # DBTITLE 1,Dataframe usando String
87 | # MAGIC %sql
88 | # MAGIC select * from vw_sql_temp_string
89 |
90 | # COMMAND ----------
91 |
92 | # MAGIC %sql
93 | # MAGIC -- Acessando um item especifico do Array
94 | # MAGIC select Resources[3] from vw_sql_temp
95 |
96 | # COMMAND ----------
97 |
98 | # DBTITLE 1,Acessando um campo Array\Struct
99 | # MAGIC %sql
100 | # MAGIC -- Voce pode navegar de forma bem simples usando ponto
101 | # MAGIC select Resources[3].displayName from vw_sql_temp
102 |
103 | # COMMAND ----------
104 |
105 | # DBTITLE 1,Explodindo e acessando niveis de forma simples
106 | # MAGIC %sql
107 | # MAGIC -- Listando todos os usuários e grupos a nivel de linha
108 | # MAGIC -- Note que estamos acessando os campos apenas com . após aplicar o Explode da coluna Resources
109 | # MAGIC select Resources.displayName,explode(Resources.members.display) from(
110 | # MAGIC select explode(Resources) as Resources from vw_sql_temp
111 | # MAGIC ) nivel1
112 |
113 | # COMMAND ----------
114 |
115 | # MAGIC %sql
116 | # MAGIC select from_json(Resources,'a string'),* from vw_sql_temp
117 |
118 | # COMMAND ----------
119 |
120 | # DBTITLE 1,Acessando campos do JSON no formato string
121 | # MAGIC %sql
122 | # MAGIC -- Ops, não é tão simples assim
123 | # MAGIC select Resources.displayName,* from vw_sql_temp_string
124 |
125 | # COMMAND ----------
126 |
127 | # DBTITLE 1,FROM_JSON
128 | # MAGIC %sql
129 | # MAGIC select from_json(Resources,"ARRAY<STRUCT<displayName: STRING>>") as Resources
130 | # MAGIC from vw_sql_temp_string
131 |
132 | # COMMAND ----------
133 |
134 | # MAGIC %sql
135 | # MAGIC select from_json(Resources,"ARRAY<STRUCT<displayName: STRING, entitlements: ARRAY<STRUCT<value: STRING>>, groups: ARRAY<STRUCT<`$ref`: STRING, display: STRING, type: STRING, value: STRING>>, id: STRING, members: ARRAY<STRUCT<`$ref`: STRING, display: STRING, value: STRING>>, meta: STRUCT<resourceType: STRING>>>") as Resources
136 | # MAGIC from vw_sql_temp_string
137 |
138 | # COMMAND ----------
139 |
140 | # MAGIC %sql
141 | # MAGIC select Resources.displayName,explode(Resources.members.display) from(
142 | # MAGIC select explode(from_json(Resources,"ARRAY<STRUCT<displayName: STRING, entitlements: ARRAY<STRUCT<value: STRING>>, groups: ARRAY<STRUCT<`$ref`: STRING, display: STRING, type: STRING, value: STRING>>, id: STRING, members: ARRAY<STRUCT<`$ref`: STRING, display: STRING, value: STRING>>, meta: STRUCT<resourceType: STRING>>>")) as Resources from vw_sql_temp_string
143 | # MAGIC ) nivel1
144 |
--------------------------------------------------------------------------------
/routines/tablesSize&Vacuum/README.md:
--------------------------------------------------------------------------------
1 | ## To use the scripts in this repo, just import them into your folder in Databricks.
2 |
3 | Select the Import option
4 |
5 |
6 | Select the script and import it:
7 |
8 |
9 |
10 | | Version | Date | Description |
11 | |-----------|-------|----------|
12 | | `v1.0` | 2022-12-01 | Running for internal customers |
13 | | `v1.1` | 2023-02-25 | Released to a few more customers and engineers |
14 | | `v2.0` | 2023-04-24 | Released publicly |
15 |
16 | Post link: https://www.datainaction.dev/post/databricks-tablessize-vacuum-monitore-e-reduza-custos-do-seu-delta-lake
17 |
18 | Functionality and purpose
19 |
20 | > The main goal of this notebook is to collect size information and clean up tables in the Delta format.
21 | >
22 | > The size of each table in **Storage** is collected and compared with the size of the **current version**, so we can estimate how much space a Vacuum operation could free.
23 | >
24 | > **Note**: Aimed at environments without Unity Catalog for now; although it works, the script will be adapted for Unity Catalog environments.
25 | >
26 | > **Note 2**: The first Vacuum runs may take longer if your environment has never gone through this maintenance; later runs will be faster, since fewer tables will need a vacuum, according to the vacuumThreshold parameter.
27 | >
28 | > **Note 3**: To reinforce: run the first execution with **runVacuum = False**, just to evaluate and get a sense of how your environment looks and how long the routine will take.
29 | >
30 | > **Note 4**: If you hit any error, send it to me on Github, LinkedIn or by e-mail and I can help you: reginaldo.silva27@gmail.com
31 | >
32 | > **Note 5**: It can increase your Storage transaction costs because of the dbutils.fs.ls calls, so use it with caution, monitor it, and run it at most once a week.
33 | >
34 | > **Note 6**: Tested on Azure (using the absolute path and Mount), AWS (Mount) and GCP (Mount) to write the control tables.
35 | >
36 | > The steps below are executed in this order:
37 | 1. List all existing tables for a given database or group of databases, using SHOW DATABASES and SHOW TABLES
38 | 2. Run a **describe detail** for each table and store the result in a Delta table for analysis and monitoring
39 | 3. Recursively scan the table folders in Storage (dbutils.fs.ls) to compute the space each table occupies, excluding the _delta_log folders (see the sketch after this list)
40 | 4. Run analysis queries to evaluate which tables can benefit from Vacuum
41 | 5. Run the Vacuum operation on the tables that reach the defined threshold
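
A minimal sketch of step 3, assuming a hypothetical helper (this is not the notebook's exact code): recursively sum the size of a table's files in storage, skipping the `_delta_log` folder.

```python
def storage_size_bytes(path: str) -> int:
    """Recursively sum file sizes under `path`, ignoring the _delta_log folder."""
    total = 0
    for item in dbutils.fs.ls(path):
        if item.isDir():
            if not item.name.startswith("_delta_log"):
                total += storage_size_bytes(item.path)
        else:
            total += item.size
    return total

# Hypothetical table path -- replace with your own storage location.
print(storage_size_bytes("abfss://bronze@mystorageaccount.dfs.core.windows.net/minha_tabela/"))
```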
42 |
43 | **Cluster recommendation:**
44 | > Start with a small cluster and monitor it; initial cluster: **Driver: Standard_DS4_v2 · 2 x Workers: Standard_DS3_v2 · Runtime >11.3**
45 |
46 | **Remarks:**
47 | - **There are other ways to perform this operation; however, this was the least complex one with the best performance, except for the Vacuum DRY RUN operation in Scala**
48 | - **In the first version I developed, I read the Delta logs, JSON files and checkpoints; however, I could not reproduce the Vacuum DRY RUN operation exactly, and performance got worse because of the number of validations I had to add**
49 | - In that version I mapped every file marked as Remove in the log; although it is faster, the precision was not there (the numbers did not match) for a few reasons, and working around those factors made the script slower
50 | - I tried to reproduce it in Scala, but my Scala knowledge is limited and it became too complex
51 | - I talked to some engineers from the Delta community, but had no success moving forward with Scala
52 | - **If you run Vacuum DRY RUN in Scala it prints the result, but that output goes to stdout and it was too complex to recover**
53 | ``%scala
54 | vacuum tablename dry run``
55 | - **I am evaluating a new version with delta.rs**
56 | - reference:
57 |
58 | If you want to venture into Scala, here is the source code:
59 |
60 |
61 | If you manage to compute this amount of space in a more performant way, let me know in a comment or on Github.
62 |
63 | **Points of attention:**
64 | - **For partitioned tables with many partitions the execution time can be longer, so monitor the first runs carefully; use it at your own responsibility. No risk has been mapped so far, it may only generate more transactions on your storage**
65 | - **Storage transaction cost on Azure: Read operations (per 10,000) - R$0.0258 (two cents per 10,000 operations) (price estimated on 2023-04-21)**
66 |
67 | Description of the parameters
68 |
69 | ### Control parameters
70 |
71 | | Parameter | Value | Description |
72 | |-----------|-------|----------|
73 | | `numThreadsParallel` | **15** | Number of parallel threads; evaluate the best value for your environment and run tests |
74 | | `vacuumThreadsParallel` | **5** | Number of parallel threads for the **Vacuum** execution; use a lower value, since it can cause problems on the cluster; evaluate the best value for your environment and run tests |
75 | | `runVacuum` | **False** | If set to **True**, runs Vacuum with the configured parameters; the default value is **False**. Run the first execution with the default to get a sense of how your environment looks |
76 | | `vacuumHours` | **168** | Number of hours of versions kept after the Vacuum; choose the best value for your environment, the default is 7 days |
77 | | `vacuumThreshold` | **5x** | Run Vacuum only on the tables whose storage is **5x** larger than the current version, for example: **a table with 100 GB of data in the current version and 500 GB in Storage will enter the cleanup routine** (see the sketch after this table) |
78 | | `enableLogs` | **False** | If set to **True**, generates logs to help with some analyses, such as the duration per table and the error, if any; however, it considerably increases processing time if you have many tables |
79 | | `enableHistory` | **False** | If set to **True**, keeps the history of every run of the routine, and each run gets a unique identifier to relate the tables across runs; if set to **False** (the default), the log tables (tableDetails and tableStorageFiles) are always truncated |
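
A minimal sketch of the vacuumThreshold rule described above, with purely illustrative sizes and table names (not the routine's actual code):

```python
vacuumThreshold = 5        # run VACUUM when storage holds 5x the current version
vacuumHours = 168          # retention window, in hours

current_version_bytes = 100 * 1024**3   # size reported by DESCRIBE DETAIL (current version)
storage_bytes = 520 * 1024**3           # size found by listing the table folder in storage

if storage_bytes >= vacuumThreshold * current_version_bytes:
    spark.sql(f"VACUUM db_controle_demo.minha_tabela RETAIN {vacuumHours} HOURS")
```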
80 |
81 | ### Metadata definition parameters
82 |
83 | | Parameter | Value | Description |
84 | |-----------|-------|----------|
85 | | `databaseTarget` | 'bronze*' | Defines which databases will be analyzed; accepts wildcards with `*`, for example, all databases starting with the word bronze: 'bronze*' |
86 | | `tablesTarget` | '*' | Defines which tables will be analyzed; accepts wildcards with `*`; by default all of them are analyzed |
87 | | `databaseCatalog` | 'db_controle' | Defines the database where the logs will be stored; if it does not exist, a new one is created |
88 | | `tableCatalog` | 'tbCatalog' | Defines the name of the control table that stores the tables to be analyzed; if it does not exist, the table is created |
89 | | `tbVacuumSummary` | 'tbVacuumSummary' | Defines the name of the table that stores the aggregated result of the run; if it does not exist, the table is created |
90 | | `tablesSizeMonitor` | 'tablesSizeMonitor' | Defines the name of the table that stores the aggregated result of the run with table-level details; if it does not exist, the table is created |
91 | | `tableDetails` | 'bdaTablesDetails' | Defines the name of the table that stores the describe detail result; if it does not exist, the table is created |
92 | | `tableStorageFiles` | 'bdaTablesStorageSize' | Defines the name of the table that stores the dbutils.fs.ls result |
93 | | `storageLocation` | 'abfss://[container]@[Storage].dfs.core.windows.net/pastaraiz/' [**Azure example**] | Defines the main storage address; an absolute path or a Mount (dbfs:/mnt/bronze/pastaRaiz/) can be used |
94 | | `tableCatalogLocation` | f'database=db_controle/table_name={tableCatalog}' | Defines the storage of the catalog table |
95 | | `tablesdetailsLocation` | f'database=db_controle/table_name={tableDetails}' | Defines the storage of the describe detail table |
96 | | `tableStorageFilesLocation` | f'database=db_controle/table_name={tableStorageFiles}' | Defines the storage of the dbutils result table |
97 | | `writeMode` | "overwrite" | Write mode: "append" if `enableHistory` is true, otherwise "overwrite" |
98 | | `identifier` | str(hash(datetime.today())) | Unique identifier for each run, used to link the tables to their corresponding executions |
99 |
100 | ## Objects created: 1 database and 5 tables
101 |
102 | > 1x database named through the databaseCatalog variable; by default the name is **db_controle**
103 | > 1x catalog table that stores the listing; by default it is named **tbCatalog**; if the enableHistory parameter is disabled it is overwritten on every run
104 | > 1x table that stores the describe detail result; by default it is named **bdaTablesDetails**; if the enableHistory parameter is disabled it is overwritten on every run
105 | > 1x table that stores the list-files result; by default it is named **tableStorageFiles**; if the enableHistory parameter is disabled it is overwritten on every run
106 | > 1x table that stores the aggregated result of the run with table-level details; by default it is named **tablesSizeMonitor**; this table is never truncated
107 | > 1x table that stores the aggregated result of the run; by default it is named **tbVacuumSummary**; this table is never truncated
108 |
109 | ## Monitoring
110 |
111 | > Monitor your environment through the tbVacuumSummary and tablesSizeMonitor tables
112 | > The **tbVacuumSummary** table stores 1 row per run with summarized data
113 | > The **tablesSizeMonitor** table stores 1 row per table per run with summarized data
114 |
115 |
116 | ## Benchmark:
117 |
118 | > Environment with more than 3,000 tables - 200 TB of Storage - 12 hours - Cluster (1xnode DS5) - 50 threads for analysis and 10 for vacuum - **Without logs (enableLogs=False)** - First run
119 | > Environment with 300 tables - 5 TB of Storage - 1 hour - Cluster (1xnode DS3) - 25 threads for analysis and 10 for vacuum - **Without logs (enableLogs=False)** - First run
120 | > Environment with 300 tables - 5 TB of Storage - 2 hours - Cluster (1xnode DS3) - 25 threads for analysis and 10 for vacuum - **With logs (enableLogs=True)** - First run
121 | > Environment with 1,000 tables - 10 GB of Storage - **6 hours - Cluster (1xnode DS3)** - 25 threads for analysis and 10 for vacuum - **With logs (enableLogs=True)** - First run
122 | > Environment with 1,000 tables - 10 GB of Storage - 6 hours - Cluster (1xnode DS3) - 25 threads for analysis and 10 for vacuum - **Without logs (enableLogs=False)** - First run
123 |
124 | ## Real cases:
125 |
126 | > **Case 1 - Azure:** More than 250 TB freed in the first run, in an environment that had no maintenance routine
127 | > **Case 2 - GCP:** 5 TB of logs freed in a small environment, where the total data was only 50 GB and the storage held 5 TB
128 | > **Case 3 - Azure:** Around 10 TB of logs freed per week, running as a Databricks Job scheduled for every weekend
129 |
130 | ## Future implementations:
131 |
132 | > 1. Use Unity Catalog
133 | > 2. Turn the code into a Python library
134 | > 3. Refactor the code using the library and improve usability
135 | > 4. Minimize dbutils.fs.ls costs by reading the transaction log directly
136 |
137 | ## References:
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 | > ``Created by: Reginaldo Silva``
146 | - [Blog Data In Action](https://datainaction.dev/)
147 | - [Github](https://github.com/reginaldosilva27)
148 |
--------------------------------------------------------------------------------
/tips/sparkconfs/Spark Confs.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # DBTITLE 1,Mostrar todas as configurações disponíveis
3 | df = spark.sparkContext.getConf().getAll()
4 | i=1
5 | for d in df:
6 | print(str(i),' - ',d)
7 | i = i+1
8 |
9 | # COMMAND ----------
10 |
11 | # DBTITLE 1,Mostrar todas as ClusterTags
12 | df = spark.sparkContext.getConf().getAll()
13 | i=1
14 | for d in df:
15 | if 'clusterUsageTags' in d[0]:
16 | print(str(i),' - ',d)
17 | i=i+1
18 |
19 | # COMMAND ----------
20 |
21 | # DBTITLE 1,Cluster tags mais comuns
22 | print(
23 | ' | Description | Value | Description |\n',
24 | ' ----------------------------------------------------------------------------------------------------------------------------------------------------\n',
25 | '| spark.databricks.clusterUsageTags.cloudProvider | ',spark.conf.get('spark.databricks.clusterUsageTags.cloudProvider'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.cloudProvider'))) * ' ','| Cloud que esta operando o Databricks', '|\n',
26 | '| spark.databricks.clusterUsageTags.azureSubscriptionId | ',spark.conf.get('spark.databricks.clusterUsageTags.azureSubscriptionId'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.azureSubscriptionId'))) * ' ','| ID da assinatura do Azure', '|\n',
27 | '| spark.databricks.clusterUsageTags.managedResourceGroup | ',spark.conf.get('spark.databricks.clusterUsageTags.managedResourceGroup'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.managedResourceGroup'))) * ' ','| Grupo de recursos no Azure que é gerenciado pelo Databricks', '|\n',
28 | '| spark.databricks.clusterUsageTags.clusterId | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterId'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterId'))) * ' ','| ID do cluster', '|\n',
29 | '| spark.databricks.clusterUsageTags.region | ',spark.conf.get('spark.databricks.clusterUsageTags.region'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.region'))) * ' ','| Região que hospeda os Clusters', '|\n',
30 | '| spark.databricks.clusterUsageTags.workerEnvironmentId | ',spark.conf.get('spark.databricks.clusterUsageTags.workerEnvironmentId'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.workerEnvironmentId'))) * ' ','| ID do Worksapce', '|\n',
32 | '| spark.databricks.clusterUsageTags.clusterLogDestination | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterLogDestination'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterLogDestination'))) * ' ','| Caminho onde os logs serão entregues', '|\n',
33 | '| spark.databricks.clusterUsageTags.isSingleUserCluster | ',spark.conf.get('spark.databricks.clusterUsageTags.isSingleUserCluster'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.isSingleUserCluster'))) * ' ','| É um cluster de usuario unico?', '|\n',
34 | '| spark.databricks.clusterUsageTags.clusterName | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterName'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterName'))) * ' ','| Nome do Cluster', '|\n',
35 | '| spark.databricks.clusterUsageTags.clusterScalingType | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterScalingType'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterScalingType'))) * ' ','| Tem auto scale?', '|\n',
36 | '| spark.databricks.clusterUsageTags.clusterNodeType | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterNodeType'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterNodeType'))) * ' ','| Familia da maquina para os nodes', '|\n',
37 | '| spark.databricks.clusterUsageTags.driverNodeType | ',spark.conf.get('spark.databricks.clusterUsageTags.driverNodeType'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.driverNodeType'))) * ' ','| Familia da maquina para o Driver', '|\n',
38 | '| spark.databricks.clusterUsageTags.clusterWorkers | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterWorkers'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterWorkers'))) * ' ','| Quantidade de workers Online', '|\n',
39 | '| spark.databricks.clusterUsageTags.effectiveSparkVersion | ',spark.conf.get('spark.databricks.clusterUsageTags.effectiveSparkVersion'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.effectiveSparkVersion'))) * ' ','| Versão do Spark operando no Cluster', '|\n',
40 | '| spark.databricks.clusterUsageTags.clusterSku | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterSku'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterSku'))) * ' ','| Tipo do Cluster', '|\n',
41 | '| spark.databricks.clusterUsageTags.clusterAvailability | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterAvailability'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterAvailability'))) * ' ','| Tipo de VMs em uso, SPOT ou On Demand', '|\n',
42 | '| spark.databricks.clusterUsageTags.enableElasticDisk | ',spark.conf.get('spark.databricks.clusterUsageTags.enableElasticDisk'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.enableElasticDisk'))) * ' ','| Possui discos elasticos para escalar?', '|\n',
43 | '| spark.databricks.clusterUsageTags.autoTerminationMinutes | ',spark.conf.get('spark.databricks.clusterUsageTags.autoTerminationMinutes'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.autoTerminationMinutes'))) * ' ','| Desligar cluster automaticamente após X minutos', '|\n',
44 | '| spark.databricks.clusterUsageTags.runtimeEngine | ',spark.conf.get('spark.databricks.clusterUsageTags.runtimeEngine'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.runtimeEngine'))) * ' ','| Tipo da Engine em execução', '|\n',
45 | '| spark.databricks.clusterUsageTags.clusterLastActivityTime | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterLastActivityTime'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterLastActivityTime'))) * ' ','| Data da ultima atividade executada no cluster', '|\n',
46 | '| spark.databricks.clusterUsageTags.enableCredentialPassthrough | ',spark.conf.get('spark.databricks.clusterUsageTags.enableCredentialPassthrough'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.enableCredentialPassthrough'))) * ' ','| Cluster com Passthrough habilitado?', '|\n',
47 | '| spark.databricks.clusterUsageTags.instanceWorkerEnvNetworkType| ',spark.conf.get('spark.databricks.clusterUsageTags.instanceWorkerEnvNetworkType'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.instanceWorkerEnvNetworkType'))) * ' ','|(vnet-injection) ou Default?', '|\n',
48 | '| spark.databricks.clusterUsageTags.enableLocalDiskEncryption | ',spark.conf.get('spark.databricks.clusterUsageTags.enableLocalDiskEncryption'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.enableLocalDiskEncryption'))) * ' ','| Criptografica local?', '|\n',
49 | '| park.databricks.clusterUsageTags.clusterOwnerOrgId | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterOwnerOrgId'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterOwnerOrgId'))) * ' ','| ID da organização, faz parte da URL do Workspace', '|\n',
50 | '| spark.databricks.clusterUsageTags.clusterPythonVersion | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterPythonVersion'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterPythonVersion'))) * ' ','| Versão do Python rodando no Cluster', '|\n',
51 | '| spark.databricks.clusterUsageTags.enableDfAcls | ',spark.conf.get('spark.databricks.clusterUsageTags.enableDfAcls'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.enableDfAcls'))) * ' ','| Possui ACL habilitado?', '|\n',
52 | '| spark.databricks.clusterUsageTags.instanceWorkerEnvId | ',spark.conf.get('spark.databricks.clusterUsageTags.instanceWorkerEnvId'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.instanceWorkerEnvId'))) * ' ','| ID da Instancia', '|\n',
53 | '| spark.databricks.clusterUsageTags.clusterUnityCatalogMode | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterUnityCatalogMode'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterUnityCatalogMode'))) * ' ','| Utiliza Unity Catalog?', '|\n',
54 | '| spark.databricks.clusterUsageTags.enableSqlAclsOnly | ',spark.conf.get('spark.databricks.clusterUsageTags.enableSqlAclsOnly'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.enableSqlAclsOnly'))) * ' ','| ACL com SQL habilitado?', '|\n',
55 | '| spark.databricks.clusterUsageTags.clusterPinned | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterPinned'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterPinned'))) * ' ','| Cluster esta pinado?', '|\n',
56 | '| spark.databricks.clusterUsageTags.privateLinkEnabled | ',spark.conf.get('spark.databricks.clusterUsageTags.privateLinkEnabled'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.privateLinkEnabled'))) * ' ','| Possui Private Link habilitado?', '|\n',
57 | '| spark.databricks.clusterUsageTags.clusterCreator | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterCreator'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterCreator'))) * ' ','| Cluster criador por', '|\n',
58 | '| spark.databricks.clusterUsageTags.clusterNumCustomTags | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterNumCustomTags'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterNumCustomTags'))) * ' ','| Quantidade de tags customizadas', '|\n',
59 | '| spark.databricks.clusterUsageTags.clusterAllTags | ',spark.conf.get('spark.databricks.clusterUsageTags.clusterAllTags'),(40-len(spark.conf.get('spark.databricks.clusterUsageTags.clusterAllTags'))) * ' ','| Quantidade de tags customizadas', '|\n',
60 | ' ----------------------------------------------------------------------------------------------------------------------------------------------------\n',
61 | 'Links Reference:\n',
62 | 'https://spark.apache.org/docs/latest/configuration.html \n',
63 | 'https://books.japila.pl/delta-lake-internals/'
64 | )
65 |
66 | # COMMAND ----------
67 |
68 | # DBTITLE 1,Spark confs comuns
69 | print(
70 | ' | Description | Value | Description |\n',
71 | ' ----------------------------------------------------------------------------------------------------------------------------------------------------\n',
72 | '| spark.databricks.cloudProvider | ',spark.conf.get('spark.databricks.cloudProvider'),(40-len(spark.conf.get('spark.databricks.cloudProvider'))) * ' ','| Cloud where Databricks is running', '|\n',
73 | '| spark.databricks.workspaceUrl | ',spark.conf.get('spark.databricks.workspaceUrl'),(40-len(spark.conf.get('spark.databricks.workspaceUrl'))) * ' ','| URL to access the Workspace', '|\n',
74 | '| spark.app.startTime | ',spark.conf.get('spark.app.startTime'),(40-len(spark.conf.get('spark.app.startTime'))) * ' ','| Spark application start time', '|\n',
75 | '| spark.app.name | ',spark.conf.get('spark.app.name'),(40-len(spark.conf.get('spark.app.name'))) * ' ','| Spark application name', '|\n',
76 | '| spark.app.id | ',spark.conf.get('spark.app.id'),(40-len(spark.conf.get('spark.app.id'))) * ' ','| Id of the application created by Spark', '|\n',
77 | '| spark.databricks.clusterSource | ',spark.conf.get('spark.databricks.clusterSource'),(40-len(spark.conf.get('spark.databricks.clusterSource'))) * ' ','| Cluster created via UI, JOB or API', '|\n',
78 | '| spark.driver.maxResultSize | ',spark.conf.get('spark.driver.maxResultSize'),(40-len(spark.conf.get('spark.driver.maxResultSize'))) * ' ','| Maximum size of the result returned by an action such as collect(); larger results are aborted to protect the Driver', '|\n',
79 | '| spark.sql.sources.default | ',spark.conf.get('spark.sql.sources.default'),(40-len(spark.conf.get('spark.sql.sources.default'))) * ' ','| Default data source format; in plain Spark the default is Parquet', '|\n',
80 | '| spark.databricks.delta.multiClusterWrites.enabled | ',spark.conf.get('spark.databricks.delta.multiClusterWrites.enabled'),(40-len(spark.conf.get('spark.databricks.delta.multiClusterWrites.enabled'))) * ' ','| Allows writes from more than one cluster in parallel', '|\n',
81 | '| spark.databricks.workerNodeTypeId | ',spark.conf.get('spark.databricks.workerNodeTypeId'),(40-len(spark.conf.get('spark.databricks.workerNodeTypeId'))) * ' ','| VM family used by the cluster Workers', '|\n',
82 | '| spark.driver.host | ',spark.conf.get('spark.driver.host'),(40-len(spark.conf.get('spark.driver.host'))) * ' ','| IP of the Driver VM', '|\n',
83 | '| spark.master | ',spark.conf.get('spark.master'),(40-len(spark.conf.get('spark.master'))) * ' ','| Cluster manager', '|\n',
84 | '| spark.databricks.driverNodeTypeId | ',spark.conf.get('spark.databricks.driverNodeTypeId'),(40-len(spark.conf.get('spark.databricks.driverNodeTypeId'))) * ' ','| VM family used by the cluster Driver', '|\n',
85 | '| spark.executor.memory | ',spark.conf.get('spark.executor.memory'),(40-len(spark.conf.get('spark.executor.memory'))) * ' ','| Amount of RAM on the Workers', '|\n',
86 | '| spark.sql.hive.metastore.version | ',spark.conf.get('spark.sql.hive.metastore.version'),(40-len(spark.conf.get('spark.sql.hive.metastore.version'))) * ' ','| Metastore version', '|\n',
87 | '| spark.databricks.automl.serviceEnabled | ',spark.conf.get('spark.databricks.automl.serviceEnabled'),(40-len(spark.conf.get('spark.databricks.automl.serviceEnabled'))) * ' ','| Whether the AutoML service is enabled', '|\n',
88 | ' ----------------------------------------------------------------------------------------------------------------------------------------------------\n',
89 | 'Links Reference:\n',
90 | 'https://spark.apache.org/docs/latest/configuration.html \n',
91 | 'https://books.japila.pl/delta-lake-internals/'
92 | )
93 |
94 | # COMMAND ----------
95 |
96 | # DBTITLE 1,Environment Variables
97 | import os
98 | i=1
99 | for var in os.environ.items():
100 | print(str(i),' - ',var)
101 | i = i+1
102 |
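103 | # COMMAND ----------
104 | 
105 | # DBTITLE 1,Safe conf lookup (sketch)
106 | # A minimal sketch, not part of the original notebook: spark.conf.get(key) raises an error when a
107 | # key is not set on the cluster, so passing a default keeps a report like the ones above from failing.
108 | # 'spark.app.name' is one of the confs printed above; the second key is a made-up example of a missing conf.
109 | def get_conf(key, default='<not set>'):
110 |     # RuntimeConfig.get accepts a default value that is returned when the key is missing
111 |     return spark.conf.get(key, default)
112 | 
113 | print(get_conf('spark.app.name'))
114 | print(get_conf('spark.example.key.that.is.not.set'))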
--------------------------------------------------------------------------------
/tips/DatabricksAcademy/Learning.md:
--------------------------------------------------------------------------------
1 | # Mastering Databricks - Everything you need to know about Databricks
2 |
3 | # Table of Contents
4 | 1. [Introduction](#Introduction)
5 | 2. [Fundamentals](#Fundamentals)
6 | 3. [Learning Paths](#Learning-Paths)
7 | 4. [Complementary Courses](#Complementary-Courses)
8 | 5. [Accreditation and Badges](#Accreditation-and-Badges)
9 | 6. [Official Certifications](#Official-Certifications)
10 | 7. [Subtitled Courses](#Subtitled-Courses)
11 | 8. [Recommended study order - Data Engineer](#Recommended-study-order-for-Data-Engineer)
12 | 9. [Recommended order for Certification](#Recommended-order-for-Certification)
13 |
14 |
15 | ## Introduction
16 | Here is a compilation of courses on the **Databricks Academy** platform that you can take 100% FREE of charge, **more than 50 thousand reais worth of courses**.
17 | 
18 | Not every course on the platform is listed, only the ones I consider essential.
19 | 
20 | ### **ATTENTION**:
21 | There are **two study portals**:
22 | **Partner**: Practically all FREE courses and materials are available for companies that are Partners. You need to sign in with your company e-mail; to find out whether your company is a Partner, or how to become one, contact a Databricks Account Manager.
23 | **Customer**: For those who are not Partners the content is a bit more limited, but almost all the essentials are FREE, **ENJOY**
24 | **NOTE: I will leave a TAG in front of each link indicating which Portal hosts the course: [PARTNER], [CUSTOMER]**
25 | 
26 | To find out whether your company is a Partner, search for it on this portal; if it is, consider yourself gifted: more than **50 thousand** reais in completely FREE courses.
27 | https://www.databricks.com/company/partners
28 | 
29 | 
30 |
31 | #### Accessing the Portal:
32 | https://www.databricks.com/br/learn/training/login
33 |
34 |
35 |
36 | ## Fundamentals
37 |
38 | > These are the introductory courses on Databricks and Generative AI, focused on the fundamentals; it is essential that you do not skip this step.
39 |
40 | - What is Big Data?: **[PARTNER]**
41 | > https://partner-academy.databricks.com/learn/course/internal/view/elearning/100/what-is-big-data
42 |
43 | - Databricks Fundamentals Learning Plan: **[PARTNER]**
44 | > https://partner-academy.databricks.com/learn/learning_plan/view/215/databricks-fundamentals-learning-plan
45 |
46 | - Databricks Fundamentals Learning Plan: **[CUSTOMER]**
47 | > https://customer-academy.databricks.com/learn/learning_plan/view/215/databricks-fundamentals-learning-plan
48 |
49 | - Databricks Generative AI Fundamentals Learning Plan: **[PARTNER]**
50 | > https://partner-academy.databricks.com/learn/lp/275/Databricks%2520Generative%2520AI%2520Fundamentals%2520Learning%2520Plan
51 |
52 | - Databricks Generative AI Fundamentals Learning Plan: **[CUSTOMER]**
53 | > https://customer-academy.databricks.com/learn/learning_plan/view/275/databricks-generative-ai-fundamentals-learning-plan
54 |
55 | ## Learning Paths
56 |
57 | - Apache Spark Developer Learning Plan: **[PARTNER]**
58 | > https://partner-academy.databricks.com/learn/lp/160/Apache%2520Spark%2520Developer%2520Learning%2520Plan
59 |
60 | - Apache Spark Developer Learning Plan: **[CUSTOMER]**
61 | > https://customer-academy.databricks.com/learn/learning_plan/view/160/apache-spark-developer-learning-plan
62 |
63 | - Data Engineer Learning Plan: **[PARTNER]**
64 | > https://partner-academy.databricks.com/learn/lp/10/Data%2520Engineer%2520Learning%2520Plan
65 |
66 | - Data Engineer Learning Plan: **[CUSTOMER]**
67 | > https://customer-academy.databricks.com/learn/learning_plan/view/10/data-engineer-learning-plan
68 |
69 | - Data Analyst Learning Plan: **[PARTNER]**
70 | > https://partner-academy.databricks.com/learn/lp/78/Data%2520Analyst%2520Learning%2520Plan
71 |
72 | - Data Analyst Learning Plan: **[CUSTOMER]**
73 | > https://customer-academy.databricks.com/learn/learning_plan/view/78/data-analyst-learning-plan
74 |
75 | - Generative AI Engineering Pathway: **[PARTNER]**
76 | > https://partner-academy.databricks.com/learn/learning_plan/view/315/generative-ai-engineering-pathway
77 |
78 | - Generative AI Engineering Pathway: **[CUSTOMER]**
79 | > https://customer-academy.databricks.com/learn/learning_plan/view/315/generative-ai-engineering-pathway
80 |
81 | - Machine Learning Practitioner Learning Plan: **[PARTNER]**
82 | > https://partner-academy.databricks.com/learn/learning_plan/view/11/machine-learning-practitioner-learning-plan
83 |
84 | - Machine Learning Practitioner Learning Plan: **[CUSTOMER]**
85 | > https://customer-academy.databricks.com/learn/learning_plan/view/11/machine-learning-practitioner-learning-plan
86 |
87 | - Azure Databricks Platform Architect Learning Plan: **[PARTNER]**
88 | > https://partner-academy.databricks.com/learn/lp/254/Azure%2520Databricks%2520Platform%2520Architect%2520Learning%2520Plan
89 |
90 | - Azure Databricks Platform Architect Learning Plan: **[CUSTOMER]**
91 | > https://customer-academy.databricks.com/learn/learning_plan/view/254/azure-databricks-platform-architect-learning-plan
92 |
93 | - AWS Databricks Platform Architect Learning Plan: **[PARTNER]**
94 | > https://partner-academy.databricks.com/learn/lp/230/AWS%2520Databricks%2520Platform%2520Architect%2520Learning%2520Plan
95 |
96 | - AWS Databricks Platform Architect Learning Plan: **[CUSTOMER]**
97 | > https://customer-academy.databricks.com/learn/learning_plan/view/230/aws-databricks-platform-architect-learning-plan
98 |
99 | - GCP Databricks Platform Architect Learning Plan: **[PARTNER]**
100 | > https://partner-academy.databricks.com/learn/lp/266/GCP%2520Databricks%2520Platform%2520Architect%2520Learning%2520Plan
101 |
102 | - GCP Databricks Platform Architect Learning Plan: **[CUSTOMER]**
103 | > https://customer-academy.databricks.com/learn/learning_plan/view/266/gcp-databricks-platform-architect-learning-plan
104 |
105 | - Platform Administrator Learning Plan: **[PARTNER]**
106 | > https://partner-academy.databricks.com/learn/lp/207/Platform%2520Administrator%2520Learning%2520Plan
107 |
108 | - Platform Administrator Learning Plan: **[CUSTOMER]**
109 | > https://customer-academy.databricks.com/learn/learning_plan/view/207/platform-administrator-learning-plan
110 |
111 | ## Complementary Courses
112 |
113 | - Databricks Specialist Sessions (Lots of great content): **[PARTNER]**
114 | > https://partner-academy.databricks.com/learn/course/1456/Databricks%2520Specialist%2520Sessions
115 | 
116 | - Databricks Specialist Sessions (Lots of great content): **[CUSTOMER]**
117 | > https://customer-academy.databricks.com/learn/course/internal/view/elearning/1456/databricks-specialist-sessions
118 | 
119 | - Advanced Data Engineering with Databricks (Focused on the Professional certification): **[PARTNER]**
120 | > https://partner-academy.databricks.com/learn/course/2268/Advanced%2520Data%2520Engineering%2520with%2520Databricks
121 | 
122 | - Advanced Data Engineering with Databricks (Focused on the Professional certification): **[CUSTOMER]**
123 | > https://customer-academy.databricks.com/learn/course/internal/view/elearning/2268/advanced-data-engineering-with-databricks
124 |
125 | - Unity Catalog Essentials: **[PARTNER]**
126 | > https://partner-academy.databricks.com/learn/learning_plan/view/211/unity-catalog-essentials
127 |
128 | - Unity Catalog Essentials: **[CUSTOMER]**
129 | > https://customer-academy.databricks.com/learn/learning_plan/view/211/unity-catalog-essentials
130 |
131 | - Preparing for UC Upgrades: **[PARTNER]**
132 | > https://partner-academy.databricks.com/learn/learning_plan/view/292/preparing-for-uc-upgrades
133 |
134 | - FY 24 Tech Summit: Partner Content: **[PARTNER]**
135 | > https://partner-academy.databricks.com/learn/learning_plan/view/294/fy-24-tech-summit-partner-content
136 |
137 | - Escape to the Lakehouse: Data Engineering Edition: **[PARTNER]**
138 | > https://partner-academy.databricks.com/learn/course/internal/view/elearning/1979/escape-to-the-lakehouse-data-engineering-edition
139 |
140 | - Escape to the Lakehouse: Data Warehousing Edition: **[PARTNER]**
141 | > https://partner-academy.databricks.com/learn/course/internal/view/elearning/1978/escape-to-the-lakehouse-data-warehousing-edition
142 |
143 | - Databricks Partner Essentials: **[PARTNER]**
144 | > https://partner-academy.databricks.com/learn/course/1263/Databricks%2520Partner%2520Essentials
145 |
146 |
147 | ## Accreditation and Badges
148 |
149 | - Databricks Fundamentals Accreditation:
150 | > https://partner-academy.databricks.com/learn/course/2308/databricks-fundamentals-accreditation;lp=215
151 |
152 | - Generative AI Fundamentals Accreditation:
153 | > https://partner-academy.databricks.com/learn/course/1811/generative-ai-fundamentals-accreditation;lp=275
154 |
155 |
156 |
157 |
158 | - Azure Databricks Platform Architect Accreditation:
159 | > https://partner-academy.databricks.com/learn/course/1752/azure-databricks-platform-architect-accreditation;lp=254
160 |
161 | - AWS Databricks Platform Architect Accreditation:
162 | > https://partner-academy.databricks.com/learn/lp/230/AWS%2520Databricks%2520Platform%2520Architect%2520Learning%2520Plan
163 |
164 | - GCP Databricks Platform Architect Accreditation:
165 | > https://partner-academy.databricks.com/learn/course/1756/gcp-databricks-platform-architect-accreditation;lp=266
166 |
167 |
168 |
169 |
170 | - Databricks Accredited Platform Administrator Accreditation:
171 | > https://partner-academy.databricks.com/learn/course/1229/databricks-accredited-platform-administrator-accreditation;lp=207
172 |
173 |
174 | ## Official Certifications
175 |
176 | - Databricks Certified Data Analyst Associate:
177 | > https://www.databricks.com/learn/certification/data-analyst-associate
178 |
179 |
180 |
181 |
182 | - Databricks Certified Data Engineer Associate:
183 | > https://www.databricks.com/learn/certification/data-engineer-associate
184 |
185 | - Databricks Certified Data Engineer Professional:
186 | > https://www.databricks.com/learn/certification/data-engineer-professional
187 |
188 |
189 |
190 |
191 |
192 | - Databricks Certified Machine Learning Associate:
193 | > https://www.databricks.com/learn/certification/machine-learning-associate
194 |
195 | - Databricks Certified Machine Learning Professional:
196 | > https://www.databricks.com/learn/certification/machine-learning-professional
197 |
198 |
199 |
200 |
201 |
202 | - Databricks Certified Associate Developer for Apache Spark:
203 | > https://www.databricks.com/learn/certification/apache-spark-developer-associate
204 |
205 | - Databricks Certified Hadoop Migration Architect:
206 | > https://www.databricks.com/learn/certification/hadoop-migration-architect
207 |
208 |
209 |
210 | ## Subtitled Courses
211 | ### Note: These courses have Portuguese subtitles.
212 |
213 | - Databricks Fundamentals Learning Plan - Portuguese BR: **[PARTNER]**
214 | > https://partner-academy.databricks.com/learn/learning_plan/view/317/plano-de-aprendizado-dos-fundamentos-da-databricks-databricks-fundamentals-learning-plan-portuguese-br
215 |
216 | - Get Started with Databricks for Data Engineering - Portuguese BR: **[PARTNER]**
217 | > https://partner-academy.databricks.com/learn/course/internal/view/elearning/2331/get-started-with-databricks-for-data-engineering-portuguese-br
218 |
219 | - Get Started with Databricks for Data Engineering - Portuguese BR: **[CUSTOMER]**
220 | > https://customer-academy.databricks.com/learn/course/internal/view/elearning/2331/get-started-with-databricks-for-data-engineering-portuguese-br
221 |
222 | - Data Engineering with Databricks - Portuguese BR: **[PARTNER]**
223 | > https://partner-academy.databricks.com/learn/course/2263/play/16173
224 |
225 | - Data Engineering with Databricks - Portuguese BR: **[CUSTOMER]**
226 | > https://customer-academy.databricks.com/learn/course/internal/view/elearning/2263/data-engineering-with-databricks-portuguese-br
227 |
228 | - Databricks Generative AI Fundamentals Learning Plan - Portuguese BR: **[PARTNER]**
229 | > https://partner-academy.databricks.com/learn/learning_plan/view/314/plano-de-aprendizado-fundamentais-de-ia-generativa-da-databricks-databricks-generative-ai-fundamentals-learning-plan-portuguese-br
230 |
231 | - Databricks Generative AI Fundamentals Learning Plan - Portuguese BR: **[CUSTOMER]**
232 | > https://customer-academy.databricks.com/learn/learning_plan/view/314/plano-de-aprendizado-fundamentais-de-ia-generativa-da-databricks-databricks-generative-ai-fundamentals-learning-plan-portuguese-br
233 |
234 | ## Recommended study order for Data Engineer
235 | 
236 | The order below is based on MY opinion; there is no single 100% correct order, but some orders are more logical than others.
237 |
238 | 1. What is Big Data?
239 | 2. Databricks Fundamentals Learning Plan
240 | 3. Apache Spark Developer Learning Plan
241 | 4. Data Engineer Learning Plan
242 | 5. Data Analyst Learning Plan
243 | 6. Platform Administrator Learning Plan
244 | 7. [Cloud] Databricks Platform Architect Learning Plan
245 | 8. Advanced Data Engineering with Databricks
246 | 9. Databricks Specialist Sessions
247 | 10. Unity Catalog Essentials
248 |
249 | ## Recommended order for Certification
250 | 
251 | The order below is based on MY opinion; there is no single 100% correct order, but some orders are more logical than others.
252 |
253 | 1. Databricks Certified Associate Developer for Apache Spark
254 | 2. Databricks Certified Data Engineer Associate
255 | 3. Databricks Certified Data Analyst Associate
256 | 4. Databricks Certified Data Engineer Professional
257 |
258 |
259 |
260 |
261 |
262 |
--------------------------------------------------------------------------------
/tips/UpgradeMethods/UpgradeUC_Examples.sql:
--------------------------------------------------------------------------------
1 | -- Databricks notebook source
2 | -- MAGIC %md
3 | -- MAGIC ##Migration table: strategies by table type
4 | -- MAGIC
5 | -- MAGIC | Id | HMS Type   | Location       | UC Type            | Method                   |
6 | -- MAGIC |----|------------|----------------|--------------------|--------------------------|
7 | -- MAGIC | 1  | Managed    | DBFS Root      | Managed/External   | CTAS / DEEP CLONE        |
8 | -- MAGIC | 2  | External   | DBFS Root      | Managed/External   | CTAS / DEEP CLONE        |
9 | -- MAGIC | 3  | Hive SerDe | DBFS Root      | Managed/External   | CTAS / DEEP CLONE        |
10 | -- MAGIC | 4  | Managed    | Mount          | External           | SYNC with Convert        |
11 | -- MAGIC | 5  | Managed    | Mount          | Managed            | CTAS / DEEP CLONE        |
12 | -- MAGIC | 6  | External   | Mount          | External           | SYNC                     |
13 | -- MAGIC | 7  | External   | Mount          | Managed            | CTAS / DEEP CLONE        |
14 | -- MAGIC | 8  | Managed    | Cloud Storage  | External           | SYNC with Convert        |
15 | -- MAGIC | 9  | Managed    | Cloud Storage  | Managed            | CTAS / DEEP CLONE        |
16 | -- MAGIC | 10 | External   | Cloud Storage  | External           | SYNC                     |
17 | -- MAGIC | 11 | External   | Cloud Storage  | Managed            | CTAS / DEEP CLONE        |
18 | -- MAGIC
19 | -- MAGIC ## Important note
20 | -- MAGIC - **set spark.databricks.sync.command.enableManagedTable=true;**
21 | -- MAGIC   - When using this option, you must not drop the table in HMS, because the data will be deleted from Storage
22 | -- MAGIC   - If you do want to drop it, use the Scala script to switch the table from Managed to External
23 | -- MAGIC
24 | -- MAGIC ## Managed vs External tables
25 | -- MAGIC 
26 | -- MAGIC - **Managed tables**:
27 | -- MAGIC   - Data and metadata are managed by Unity Catalog.
28 | -- MAGIC   - Data is stored in the location specified by the Unity catalog (typically cloud storage).
29 | -- MAGIC   - Dropping a managed table also removes the data.
30 | -- MAGIC     - In HMS the data is removed immediately
31 | -- MAGIC     - In UC the data is kept for another 30 days
32 | -- MAGIC       - Here you can use UNDROP for up to 7 days
33 | -- MAGIC 
34 | -- MAGIC - **External tables**:
35 | -- MAGIC   - Only the metadata is managed by Unity Catalog; the data remains in external storage (usually a bucket or another cloud resource).
36 | -- MAGIC   - Dropping an external table removes only the metadata; the data remains in the original storage.
37 | -- MAGIC   - Allows data to be shared across different systems or applications.
38 | -- MAGIC
39 | -- MAGIC ### DBFS Root vs Mount vs Cloud Storage
40 | -- MAGIC 
41 | -- MAGIC - **DBFS Root**:
42 | -- MAGIC   - The Databricks distributed file system (Databricks File System).
43 | -- MAGIC   - Temporary, volatile storage, with possible limitations for long-running operations.
44 | -- MAGIC   - The data physically lives in Databricks-managed storage that you do not have access to
45 | -- MAGIC 
46 | -- MAGIC - **Mount**:
47 | -- MAGIC   - A way to access external storage (such as S3, ADLS) through DBFS as if it were a local directory.
48 | -- MAGIC   - The data remains in external storage but can be accessed inside Databricks via mounted paths.
49 | -- MAGIC 
50 | -- MAGIC - **Cloud Storage**:
51 | -- MAGIC   - Cloud storage (e.g. AWS S3, Azure Data Lake, Google Cloud Storage) where data can be stored and accessed directly.
52 | -- MAGIC   - More flexible for large volumes and long-term solutions.
53 | -- MAGIC
54 | -- MAGIC ### CTAS, DEEP CLONE and SYNC methods
55 | -- MAGIC 
56 | -- MAGIC - **CTAS (Create Table As Select)**:
57 | -- MAGIC   - Method used to create a new table from the results of a SQL query.
58 | -- MAGIC   - The new table can be created with aggregated or filtered data.
59 | -- MAGIC   - Usage example: `CREATE TABLE nova_tabela AS SELECT * FROM tabela_existente WHERE condição`.
60 | -- MAGIC 
61 | -- MAGIC - **DEEP CLONE**:
62 | -- MAGIC   - Method used to clone tables, copying their data and metadata (the clone starts with a fresh transaction history).
63 | -- MAGIC   - Used for fast table copies; useful in backup or migration scenarios.
64 | -- MAGIC   - Example: `CREATE TABLE destino DEEP CLONE origem` creates a full copy of the source table.
65 | -- MAGIC 
66 | -- MAGIC - **SYNC**:
67 | -- MAGIC   - Synchronizes external tables with Unity Catalog, ensuring the catalog reflects changes made directly in storage.
68 | -- MAGIC   - Essential for keeping the metadata in Unity Catalog consistent with the external storage.
69 | -- MAGIC   - Useful for scenarios where data may be changed outside Databricks.
70 | -- MAGIC
71 | -- MAGIC
72 | -- MAGIC Databricks blog post:
73 | -- MAGIC https://www.databricks.com/blog/migrating-tables-hive-metastore-unity-catalog-metastore#appendix
74 | -- MAGIC 
75 | -- MAGIC Official notebook:
76 | -- MAGIC https://notebooks.databricks.com/notebooks/uc-upgrade-scenario-with-examples-for-blog.dbc?_gl=1*1nrxwtq*_gcl_au*OTUxMzE5NDg3LjE2OTM0NjcxNDM.
77 | -- MAGIC
78 |
79 | -- COMMAND ----------
80 |
81 | -- MAGIC %md
82 | -- MAGIC ##Scenario 1: Managed tables on HMS with DBFS Root location
83 |
84 | -- COMMAND ----------
85 |
86 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade;
87 | create database if not exists hive_metastore.hmsdb_upgrade_db;
88 |
89 | -- COMMAND ----------
90 |
91 | desc schema extended hive_metastore.hmsdb_upgrade_db;
92 |
93 | -- COMMAND ----------
94 |
95 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_parquet;
96 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_parquet
97 | using parquet
98 | as
99 | select * from parquet.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.parquet/` limit 100;
100 |
101 | -- COMMAND ----------
102 |
103 | desc extended hive_metastore.hmsdb_upgrade_db.people_parquet;
104 |
105 | -- COMMAND ----------
106 |
107 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade;
108 |
109 | -- COMMAND ----------
110 |
111 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/demo_uc/uc_upgrade_db";
112 |
113 | -- COMMAND ----------
114 |
115 | -- DBTITLE 1,Create UC managed Delta table using CTAS (Preferred)
116 | -- Why not DEEP CLONE? DEEP CLONE is recommended for DELTA tables (the source here is Parquet), so we use CTAS
117 | drop table if exists demo_uc_demo.uc_upgrade_db.people_delta;
118 | create table if not exists
119 | demo_uc_demo.uc_upgrade_db.people_delta
120 | as
121 | select * from hive_metastore.hmsdb_upgrade_db.people_parquet;
122 |
123 | -- COMMAND ----------
124 |
125 | desc extended demo_uc_demo.uc_upgrade_db.people_delta;
126 |
127 | -- COMMAND ----------
128 |
129 | -- DBTITLE 1,Alternatively Create UC External table (with the same HMS file format) using CTAS
130 | drop table if exists demo_uc_demo.uc_upgrade_db.people_parquet_ext;
131 | create table if not exists demo_uc_demo.uc_upgrade_db.people_parquet_ext
132 | using parquet
133 | location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/demo_uc/uc_upgrade_db/people_parquet_ext"
134 | as
135 | select * from hive_metastore.hmsdb_upgrade_db.people_parquet;
136 |
137 | -- COMMAND ----------
138 |
139 | desc extended demo_uc_demo.uc_upgrade_db.people_parquet_ext;
140 |
141 | -- COMMAND ----------
142 |
143 | -- MAGIC %md
144 | -- MAGIC ##Scenario 2: External tables on HMS with DBFS Root location
145 |
146 | -- COMMAND ----------
147 |
148 | -- MAGIC %md
149 | -- MAGIC ## Scenario 3: HMS Hive SerDe table
150 |
151 | -- COMMAND ----------
152 |
153 | -- MAGIC %md
154 | -- MAGIC ## Scenario 4: Managed table on HMS with mounted file paths to External UC Table
155 |
156 | -- COMMAND ----------
157 |
158 | -- MAGIC %python
159 | -- MAGIC dbutils.fs.mounts()
160 |
161 | -- COMMAND ----------
162 |
163 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade;
164 | create database if not exists hive_metastore.hmsdb_upgrade_db location "dbfs:/mnt/landing/hmsdb_upgrade_db/"
165 |
166 | -- COMMAND ----------
167 |
168 | -- DBTITLE 1,Managed Delta HMS table
169 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_delta;
170 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_delta
171 | as
172 | select * from delta.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.delta` limit 100;
173 |
174 | -- COMMAND ----------
175 |
176 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta;
177 |
178 | -- COMMAND ----------
179 |
180 | select current_version();
181 |
182 | -- COMMAND ----------
183 |
184 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade;
185 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/demo_uc/uc_upgrade_db";
186 |
187 | -- COMMAND ----------
188 |
189 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN;
190 |
191 | -- COMMAND ----------
192 |
193 | set spark.databricks.sync.command.enableManagedTable=true;
194 |
195 | -- COMMAND ----------
196 |
197 | describe extended hive_metastore.hmsdb_upgrade_db.people_delta;
198 |
199 | -- COMMAND ----------
200 |
201 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta;
202 |
203 | -- COMMAND ----------
204 |
205 | desc extended demo_uc_demo.uc_upgrade_db.people_delta;
206 |
207 | -- COMMAND ----------
208 |
209 | select * from demo_uc_demo.uc_upgrade_db.people_delta;
210 |
211 | -- COMMAND ----------
212 |
213 | -- DBTITLE 1,Convert HMS Managed Table to External Table
214 | -- MAGIC %scala
215 | -- MAGIC import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
216 | -- MAGIC import org.apache.spark.sql.catalyst.TableIdentifier
217 | -- MAGIC
218 | -- MAGIC val tableName = "people_delta"
219 | -- MAGIC val dbName = "hmsdb_upgrade_db"
220 | -- MAGIC
221 | -- MAGIC val oldTable: CatalogTable = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName, Some(dbName)))
222 | -- MAGIC val alteredTable: CatalogTable = oldTable.copy(tableType = CatalogTableType.EXTERNAL)
223 | -- MAGIC spark.sessionState.catalog.alterTable(alteredTable)
224 |
225 | -- COMMAND ----------
226 |
227 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta;
228 |
229 | -- COMMAND ----------
230 |
231 | -- Files will not be deleted (the table is now External)
232 | drop table hive_metastore.hmsdb_upgrade_db.people_delta;
233 |
234 | -- COMMAND ----------
235 |
236 | select * from demo_uc_demo.uc_upgrade_db.people_delta;
237 |
238 | -- COMMAND ----------
239 |
240 | desc extended demo_uc_demo.uc_upgrade_db.people_delta;
241 |
242 | -- COMMAND ----------
243 |
244 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN;
245 |
246 | -- COMMAND ----------
247 |
248 | -- MAGIC %md
249 | -- MAGIC ## Scenario 5: Managed table on HMS with mounted file paths to Managed UC Table
250 |
251 | -- COMMAND ----------
252 |
253 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade;
254 | create database if not exists hive_metastore.hmsdb_upgrade_db location "dbfs:/mnt/landing/hmsdb_upgrade_db/"
255 |
256 | -- COMMAND ----------
257 |
258 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_delta;
259 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_delta
260 | as
261 | select * from delta.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.delta` limit 100;
262 |
263 | -- COMMAND ----------
264 |
265 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta;
266 |
267 | -- COMMAND ----------
268 |
269 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade;
270 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/demo_uc/uc_upgrade_db/uc_upgrade_schema_2/";
271 |
272 | -- COMMAND ----------
273 |
274 | set spark.databricks.sync.command.enableManagedTable=false;
275 |
276 | -- COMMAND ----------
277 |
278 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN;
279 |
280 | -- COMMAND ----------
281 |
282 | -- Or use DEEP CLONE instead of this CTAS
283 | drop table if exists demo_uc_demo.uc_upgrade_db.people_delta;
284 | create table if not exists demo_uc_demo.uc_upgrade_db.people_delta
285 | as
286 | select * from hive_metastore.hmsdb_upgrade_db.people_delta;
287 |
288 | -- COMMAND ----------
289 |
290 | desc extended demo_uc_demo.uc_upgrade_db.people_delta;
291 |
292 | -- COMMAND ----------
293 |
294 | -- MAGIC %md
295 | -- MAGIC ## Scenario 6: External table on HMS with mounted file paths to External UC Table
296 |
297 | -- COMMAND ----------
298 |
299 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade;
300 | create database if not exists hive_metastore.hmsdb_upgrade_db location "dbfs:/mnt/landing/hmsdb_upgrade_db/"
301 |
302 | -- COMMAND ----------
303 |
304 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_delta;
305 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_delta
306 | location "dbfs:/mnt/landing/hmsdb_upgrade_db/people_delta"
307 | as
308 | select * from delta.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.delta` limit 100;
309 |
310 | -- COMMAND ----------
311 |
312 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta;
313 |
314 | -- COMMAND ----------
315 |
316 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade;
317 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/uc_upgrade_schema_2/";
318 |
319 | -- COMMAND ----------
320 |
321 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN;
322 |
323 | -- COMMAND ----------
324 |
325 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta;
326 |
327 | -- COMMAND ----------
328 |
329 | desc extended demo_uc_demo.uc_upgrade_db.people_delta;
330 |
331 | -- COMMAND ----------
332 |
333 | -- MAGIC %md
334 | -- MAGIC ## Scenario 7: External table on HMS with mounted file paths to Managed UC Table
335 |
336 | -- COMMAND ----------
337 |
338 | -- MAGIC %md
339 | -- MAGIC ## Scenario 8: Managed table on HMS with cloud storage file paths to External UC Table
340 |
341 | -- COMMAND ----------
342 |
343 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade;
344 | create database if not exists hive_metastore.hmsdb_upgrade_db location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/hmsdb_upgrade_db/"
345 |
346 | -- COMMAND ----------
347 |
348 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_delta;
349 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_delta
350 | as
351 | select * from delta.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.delta` limit 100;
352 |
353 | -- COMMAND ----------
354 |
355 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta;
356 |
357 | -- COMMAND ----------
358 |
359 | set spark.databricks.sync.command.enableManagedTable=true;
360 |
361 | -- COMMAND ----------
362 |
363 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade;
364 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/uc_upgrade_schema_10/";
365 |
366 | -- COMMAND ----------
367 |
368 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN;
369 |
370 | -- COMMAND ----------
371 |
372 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta;
373 |
374 | -- COMMAND ----------
375 |
376 | desc extended demo_uc_demo.uc_upgrade_db.people_delta;
377 |
378 | -- COMMAND ----------
379 |
380 | -- MAGIC %scala
381 | -- MAGIC import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
382 | -- MAGIC import org.apache.spark.sql.catalyst.TableIdentifier
383 | -- MAGIC
384 | -- MAGIC val tableName = "people_delta"
385 | -- MAGIC val dbName = "hmsdb_upgrade_db"
386 | -- MAGIC
387 | -- MAGIC val oldTable: CatalogTable = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName, Some(dbName)))
388 | -- MAGIC val alteredTable: CatalogTable = oldTable.copy(tableType = CatalogTableType.EXTERNAL)
389 | -- MAGIC spark.sessionState.catalog.alterTable(alteredTable)
390 |
391 | -- COMMAND ----------
392 |
393 | select * from demo_uc_demo.uc_upgrade_db.people_delta ;
394 |
395 | -- COMMAND ----------
396 |
397 | desc extended demo_uc_demo.uc_upgrade_db.people_delta ;
398 |
399 | -- COMMAND ----------
400 |
401 | -- MAGIC %md
402 | -- MAGIC ## Scenario 9: Managed table on HMS with cloud storage file paths to Managed UC Table
403 |
404 | -- COMMAND ----------
405 |
406 | -- MAGIC %md
407 | -- MAGIC ## Scenario 10: External table on HMS with cloud storage file paths to External UC Table
408 |
409 | -- COMMAND ----------
410 |
411 | drop database if exists hive_metastore.hmsdb_upgrade_db cascade;
412 | create database if not exists hive_metastore.hmsdb_upgrade_db location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/hms/hmsdb_upgrade_db/"
413 |
414 | -- COMMAND ----------
415 |
416 | drop table if exists hive_metastore.hmsdb_upgrade_db.people_delta;
417 | create table if not exists hive_metastore.hmsdb_upgrade_db.people_delta
418 | location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/hms/hmsdb_upgrade_db/people_delta"
419 | as
420 | select * from delta.`dbfs:/databricks-datasets/learning-spark-v2/people/people-10m.delta` limit 100;
421 |
422 | -- COMMAND ----------
423 |
424 | desc extended hive_metastore.hmsdb_upgrade_db.people_delta;
425 |
426 | -- COMMAND ----------
427 |
428 | create catalog if not exists demo_uc_demo
429 |
430 | -- COMMAND ----------
431 |
432 | drop schema if exists demo_uc_demo.uc_upgrade_db cascade;
433 | create schema if not exists demo_uc_demo.uc_upgrade_db managed location "abfss://bronze@datalakedatainactiondev.dfs.core.windows.net/unitycatalog/demo_uc_demo/uc_upgrade_db/";
434 |
435 | -- COMMAND ----------
436 |
437 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta DRY RUN;
438 |
439 | -- COMMAND ----------
440 |
441 | sync table demo_uc_demo.uc_upgrade_db.people_delta from hive_metastore.hmsdb_upgrade_db.people_delta;
442 |
443 | -- COMMAND ----------
444 |
445 | desc extended demo_uc_demo.uc_upgrade_db.people_delta;
446 |
447 | -- COMMAND ----------
448 |
449 | select * from demo_uc_demo.uc_upgrade_db.people_delta;
450 |
451 | -- COMMAND ----------
452 |
453 | -- MAGIC %md
454 | -- MAGIC ## Scenario 11: External table on HMS with cloud storage file paths to Managed UC Table
455 |
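456 | -- COMMAND ----------
457 | 
458 | -- DBTITLE 1,Scenario 11 sketch (not part of the original demo)
459 | -- A minimal sketch, assuming the demo objects created above: the migration table maps scenario 11
460 | -- (External HMS table on cloud storage -> Managed UC table) to CTAS / DEEP CLONE. Since the source is
461 | -- Delta, DEEP CLONE also carries the table metadata. The target name people_delta_managed is hypothetical.
462 | drop table if exists demo_uc_demo.uc_upgrade_db.people_delta_managed;
463 | create table if not exists demo_uc_demo.uc_upgrade_db.people_delta_managed
464 | deep clone hive_metastore.hmsdb_upgrade_db.people_delta;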
--------------------------------------------------------------------------------