├── .gitignore ├── spark ├── requirements │ ├── requirements.in │ └── requirements.txt ├── .env ├── entrypoint.sh ├── spark-defaults.conf ├── spark-defaults-pg-catalog.conf ├── spark-defaults-minio.conf └── Dockerfile ├── .env ├── minio ├── Dockerfile ├── .env ├── .env.backup ├── entrypoint.sh └── README.md ├── .env.backup ├── medium-data ├── db │ └── minio │ │ ├── metadata │ │ ├── 274b9f78-7835-494a-88e7-c4a7fbc87659-m0.avro │ │ ├── b10658da-8308-41c7-9209-217725fa8660-m0.avro │ │ ├── c5934717-68b5-4262-8218-a2de395ba51e-m0.avro │ │ ├── c5934717-68b5-4262-8218-a2de395ba51e-m1.avro │ │ ├── d4223bc8-a7bd-46b3-80f3-7434fb900e74-m0.avro │ │ ├── d4223bc8-a7bd-46b3-80f3-7434fb900e74-m1.avro │ │ ├── snap-2032559078621466157-1-274b9f78-7835-494a-88e7-c4a7fbc87659.avro │ │ ├── snap-739442481904053118-1-c5934717-68b5-4262-8218-a2de395ba51e.avro │ │ ├── snap-7426647800932772370-1-b10658da-8308-41c7-9209-217725fa8660.avro │ │ ├── snap-7825605915503001692-1-d4223bc8-a7bd-46b3-80f3-7434fb900e74.avro │ │ ├── 00000-f5b4c31a-cf6e-4723-a086-0c66c6a102f4.metadata.json │ │ ├── 00001-4c5adbca-0447-491b-999c-50c28051f24d.metadata.json │ │ ├── 00002-9911da1b-4e8c-4216-96f8-df53466e5096.metadata.json │ │ └── 00003-019ca905-74a2-40a8-a8f3-e123372820fa.metadata.json │ │ └── data │ │ ├── 00000-0-9a99aa40-5d08-4d7e-850a-d307d23f5c0f-00001.parquet │ │ ├── 00000-14-ab2a6c1e-23c1-4aa1-8d9a-09a9d7d97238-00001.parquet │ │ ├── 00000-19-b5ae6ac4-9f6e-4b63-a510-a185ecd03fee-00001.parquet │ │ ├── 00001-1-9a99aa40-5d08-4d7e-850a-d307d23f5c0f-00001.parquet │ │ └── 00007-9-012e468c-5234-40d8-8ce4-d7ec420a068f-00001.parquet └── transformed_avro_files │ ├── manifest_lists │ ├── snap-7426647800932772370-1-b10658da-8308-41c7-9209-217725fa8660.json │ ├── snap-2032559078621466157-1-274b9f78-7835-494a-88e7-c4a7fbc87659.json │ ├── snap-7825605915503001692-1-d4223bc8-a7bd-46b3-80f3-7434fb900e74.json │ └── snap-739442481904053118-1-c5934717-68b5-4262-8218-a2de395ba51e.json │ └── manifests │ ├── 
c5934717-68b5-4262-8218-a2de395ba51e-m0.json │ ├── 274b9f78-7835-494a-88e7-c4a7fbc87659-m0.json │ ├── c5934717-68b5-4262-8218-a2de395ba51e-m1.json │ ├── d4223bc8-a7bd-46b3-80f3-7434fb900e74-m1.json │ ├── d4223bc8-a7bd-46b3-80f3-7434fb900e74-m0.json │ └── b10658da-8308-41c7-9209-217725fa8660-m0.json ├── .pre-commit-config.yaml ├── docker-compose-pg-catalog.yml ├── README.md ├── docker-compose.yml ├── notebooks ├── read-data.ipynb ├── .ipynb_checkpoints │ ├── iceberg-getting-started-checkpoint.ipynb │ ├── getting-started-checkpoint.ipynb │ ├── inspect-catalogs-checkpoint.ipynb │ ├── add-modify-remove-data-checkpoint.ipynb │ ├── postgres-metadata-checkpoint.ipynb │ └── postgres-catalog-checkpoint.ipynb ├── getting-started.ipynb ├── inspect-catalogs.ipynb ├── add-modify-remove-data.ipynb └── postgres-catalog.ipynb ├── docker-compose-minio.yml ├── Makefile └── helper_scripts └── print_avro_contents.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | /warehouse/ 4 | helper_scripts/*.json 5 | -------------------------------------------------------------------------------- /spark/requirements/requirements.in: -------------------------------------------------------------------------------- 1 | jupyter 2 | pyiceberg[pyarrow,duckdb,pandas] 3 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | USER=user 2 | PASSWORD=password 3 | 4 | MINIO_ROOT_USER=user 5 | MINIO_ROOT_PASSWORD=password 6 | -------------------------------------------------------------------------------- /minio/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM minio/mc 2 | 3 | COPY entrypoint.sh . 
4 | 5 | RUN chmod +x entrypoint.sh 6 | -------------------------------------------------------------------------------- /.env.backup: -------------------------------------------------------------------------------- 1 | USER=user 2 | PASSWORD=password 3 | 4 | MINIO_ROOT_USER=user 5 | MINIO_ROOT_PASSWORD=password 6 | -------------------------------------------------------------------------------- /minio/.env: -------------------------------------------------------------------------------- 1 | USER=user 2 | PASSWORD=password 3 | 4 | MINIO_ROOT_USER=user 5 | MINIO_ROOT_PASSWORD=password 6 | MINIO_DOMAIN=minio-s3 7 | MINIO_REGION=us-east-1 8 | -------------------------------------------------------------------------------- /minio/.env.backup: -------------------------------------------------------------------------------- 1 | USER=user 2 | PASSWORD=password 3 | 4 | MINIO_ROOT_USER=user 5 | MINIO_ROOT_PASSWORD=password 6 | MINIO_DOMAIN=minio-s3 7 | MINIO_REGION=us-east-1 8 | -------------------------------------------------------------------------------- /spark/.env: -------------------------------------------------------------------------------- 1 | __SPARK_NO_DAEMONIZE=true 2 | 3 | MINIO_USER=user 4 | MINIO_PASSWORD=password 5 | MINIO_REGION=us-east-1 6 | 7 | AWS_ACCESS_KEY_ID=user 8 | AWS_SECRET_ACCESS_KEY=password 9 | AWS_REGION=us-east-1 10 | -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/274b9f78-7835-494a-88e7-c4a7fbc87659-m0.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/metadata/274b9f78-7835-494a-88e7-c4a7fbc87659-m0.avro -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/b10658da-8308-41c7-9209-217725fa8660-m0.avro: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/metadata/b10658da-8308-41c7-9209-217725fa8660-m0.avro -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/c5934717-68b5-4262-8218-a2de395ba51e-m0.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/metadata/c5934717-68b5-4262-8218-a2de395ba51e-m0.avro -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/c5934717-68b5-4262-8218-a2de395ba51e-m1.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/metadata/c5934717-68b5-4262-8218-a2de395ba51e-m1.avro -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/d4223bc8-a7bd-46b3-80f3-7434fb900e74-m0.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/metadata/d4223bc8-a7bd-46b3-80f3-7434fb900e74-m0.avro -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/d4223bc8-a7bd-46b3-80f3-7434fb900e74-m1.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/metadata/d4223bc8-a7bd-46b3-80f3-7434fb900e74-m1.avro -------------------------------------------------------------------------------- /medium-data/db/minio/data/00000-0-9a99aa40-5d08-4d7e-850a-d307d23f5c0f-00001.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/data/00000-0-9a99aa40-5d08-4d7e-850a-d307d23f5c0f-00001.parquet -------------------------------------------------------------------------------- /medium-data/db/minio/data/00000-14-ab2a6c1e-23c1-4aa1-8d9a-09a9d7d97238-00001.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/data/00000-14-ab2a6c1e-23c1-4aa1-8d9a-09a9d7d97238-00001.parquet -------------------------------------------------------------------------------- /medium-data/db/minio/data/00000-19-b5ae6ac4-9f6e-4b63-a510-a185ecd03fee-00001.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/data/00000-19-b5ae6ac4-9f6e-4b63-a510-a185ecd03fee-00001.parquet -------------------------------------------------------------------------------- /medium-data/db/minio/data/00001-1-9a99aa40-5d08-4d7e-850a-d307d23f5c0f-00001.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/data/00001-1-9a99aa40-5d08-4d7e-850a-d307d23f5c0f-00001.parquet -------------------------------------------------------------------------------- /medium-data/db/minio/data/00007-9-012e468c-5234-40d8-8ce4-d7ec420a068f-00001.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/data/00007-9-012e468c-5234-40d8-8ce4-d7ec420a068f-00001.parquet -------------------------------------------------------------------------------- 
/medium-data/db/minio/metadata/snap-2032559078621466157-1-274b9f78-7835-494a-88e7-c4a7fbc87659.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/metadata/snap-2032559078621466157-1-274b9f78-7835-494a-88e7-c4a7fbc87659.avro -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/snap-739442481904053118-1-c5934717-68b5-4262-8218-a2de395ba51e.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/metadata/snap-739442481904053118-1-c5934717-68b5-4262-8218-a2de395ba51e.avro -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/snap-7426647800932772370-1-b10658da-8308-41c7-9209-217725fa8660.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/metadata/snap-7426647800932772370-1-b10658da-8308-41c7-9209-217725fa8660.avro -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/snap-7825605915503001692-1-d4223bc8-a7bd-46b3-80f3-7434fb900e74.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/apache-iceberg-data-exploration/HEAD/medium-data/db/minio/metadata/snap-7825605915503001692-1-d4223bc8-a7bd-46b3-80f3-7434fb900e74.avro -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: 
trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - repo: https://github.com/kynan/nbstripout 9 | rev: 0.6.1 10 | hooks: 11 | - id: nbstripout 12 | args: ["--extra-keys", "metadata.pycharm"] 13 | -------------------------------------------------------------------------------- /spark/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPARK_WORKLOAD=$1 4 | 5 | echo "SPARK_WORKLOAD: $SPARK_WORKLOAD" 6 | 7 | if [ "$SPARK_WORKLOAD" == "master" ]; 8 | then 9 | start-master.sh -p 7077 10 | 11 | eval notebook 12 | elif [ "$SPARK_WORKLOAD" == "worker" ]; 13 | then 14 | WORKER_PORT=${2:-8081} 15 | echo "$WORKER_PORT" 16 | 17 | start-worker.sh spark://spark-iceberg:7077 --webui-port "$WORKER_PORT" 18 | elif [ "$SPARK_WORKLOAD" == "history" ] 19 | then 20 | start-history-server.sh 21 | fi 22 | -------------------------------------------------------------------------------- /docker-compose-pg-catalog.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | pg-catalog: 5 | image: postgres:15-alpine 6 | container_name: pg_catalog 7 | environment: 8 | - POSTGRES_USER=iceberg 9 | - POSTGRES_PASSWORD=iceberg 10 | - POSTGRES_DB=iceberg 11 | healthcheck: 12 | test: [ "CMD", "pg_isready", "-U", "iceberg" ] 13 | interval: 5s 14 | retries: 5 15 | ports: 16 | - "5432:5432" 17 | 18 | spark-iceberg: 19 | build: 20 | context: ./spark 21 | args: 22 | SPARK_DEFAULTS_CONF: spark-defaults-pg-catalog.conf 23 | depends_on: 24 | - pg-catalog 25 | -------------------------------------------------------------------------------- /spark/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.master spark://spark-iceberg:7077 2 | spark.eventLog.enabled true 3 | spark.eventLog.dir /opt/spark/spark-events 4 | spark.history.fs.logDirectory /opt/spark/spark-events 5 | spark.sql.extensions 
org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions 6 | spark.sql.catalog.data org.apache.iceberg.spark.SparkCatalog 7 | spark.sql.catalog.data.type hadoop 8 | spark.sql.catalog.data.warehouse /home/iceberg/warehouse 9 | spark.sql.defaultCatalog data 10 | spark.sql.catalogImplementation in-memory 11 | -------------------------------------------------------------------------------- /medium-data/transformed_avro_files/manifest_lists/snap-7426647800932772370-1-b10658da-8308-41c7-9209-217725fa8660.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "manifest_path": "s3://iceberg-data/db/minio/metadata/b10658da-8308-41c7-9209-217725fa8660-m0.avro", 4 | "manifest_length": 7060, 5 | "partition_spec_id": 0, 6 | "content": 0, 7 | "sequence_number": 1, 8 | "min_sequence_number": 1, 9 | "added_snapshot_id": 7426647800932772370, 10 | "added_data_files_count": 2, 11 | "existing_data_files_count": 0, 12 | "deleted_data_files_count": 0, 13 | "added_rows_count": 5, 14 | "existing_rows_count": 0, 15 | "deleted_rows_count": 0, 16 | "partitions": [] 17 | } 18 | ] -------------------------------------------------------------------------------- /spark/spark-defaults-pg-catalog.conf: -------------------------------------------------------------------------------- 1 | spark.master spark://spark-iceberg:7077 2 | spark.eventLog.enabled true 3 | spark.eventLog.dir /opt/spark/spark-events 4 | spark.history.fs.logDirectory /opt/spark/spark-events 5 | spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions 6 | spark.sql.catalog.data org.apache.iceberg.spark.SparkCatalog 7 | spark.sql.catalog.data.warehouse /home/iceberg/warehouse 8 | spark.sql.catalog.data.catalog-impl org.apache.iceberg.jdbc.JdbcCatalog 9 | spark.sql.catalog.data.uri jdbc:postgresql://pg-catalog:5432/iceberg 10 | spark.sql.catalog.data.jdbc.user iceberg 11 | spark.sql.catalog.data.jdbc.password iceberg 12 | spark.sql.defaultCatalog 
data 13 | spark.sql.catalogImplementation in-memory 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # About the repo 2 | 3 | I've wanted to learn about Apache Iceberg for some time. This repo is a result of that. 4 | It's also a code repository for the Medium articles written about the process of learning. 5 | 6 | # Medium articles published 7 | 1. [Learning Apache Iceberg - an introspection](https://medium.com/@MarinAgli1/learning-apache-iceberg-an-introspection-f479ee8c7461) 8 | 2. [Learning Apache Iceberg - Storing the Catalog to Postgres](https://medium.com/better-programming/learning-apache-iceberg-storing-the-catalog-to-postgres-c54ef5e7c628) 9 | 3. [Learning Apache Iceberg - storing the data to Minio S3](https://medium.com/@MarinAgli1/learning-apache-iceberg-storing-the-data-to-minio-s3-56670cef199d) 10 | 4. [Learning Apache Iceberg - looking at append, update, and delete operations](https://medium.com/@MarinAgli1/learning-apache-iceberg-looking-at-append-update-and-delete-operations-179ad63cb6cb) -------------------------------------------------------------------------------- /spark/spark-defaults-minio.conf: -------------------------------------------------------------------------------- 1 | spark.master spark://spark-iceberg:7077 2 | spark.eventLog.enabled true 3 | spark.eventLog.dir /opt/spark/spark-events 4 | spark.history.fs.logDirectory /opt/spark/spark-events 5 | spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions 6 | spark.sql.catalog.data org.apache.iceberg.spark.SparkCatalog 7 | spark.sql.catalog.data.warehouse s3://iceberg-data 8 | spark.sql.catalog.data.s3.endpoint http://minio-s3:9000 9 | spark.sql.catalog.data.io-impl org.apache.iceberg.aws.s3.S3FileIO 10 | spark.sql.catalog.data.catalog-impl org.apache.iceberg.jdbc.JdbcCatalog 11 | spark.sql.catalog.data.uri 
jdbc:postgresql://pg-catalog:5432/iceberg 12 | spark.sql.catalog.data.jdbc.user iceberg 13 | spark.sql.catalog.data.jdbc.password iceberg 14 | spark.sql.defaultCatalog data 15 | spark.sql.catalogImplementation in-memory 16 | -------------------------------------------------------------------------------- /medium-data/transformed_avro_files/manifest_lists/snap-2032559078621466157-1-274b9f78-7835-494a-88e7-c4a7fbc87659.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "manifest_path": "s3://iceberg-data/db/minio/metadata/274b9f78-7835-494a-88e7-c4a7fbc87659-m0.avro", 4 | "manifest_length": 6947, 5 | "partition_spec_id": 0, 6 | "content": 0, 7 | "sequence_number": 2, 8 | "min_sequence_number": 2, 9 | "added_snapshot_id": 2032559078621466157, 10 | "added_data_files_count": 1, 11 | "existing_data_files_count": 0, 12 | "deleted_data_files_count": 0, 13 | "added_rows_count": 1, 14 | "existing_rows_count": 0, 15 | "deleted_rows_count": 0, 16 | "partitions": [] 17 | }, 18 | { 19 | "manifest_path": "s3://iceberg-data/db/minio/metadata/b10658da-8308-41c7-9209-217725fa8660-m0.avro", 20 | "manifest_length": 7060, 21 | "partition_spec_id": 0, 22 | "content": 0, 23 | "sequence_number": 1, 24 | "min_sequence_number": 1, 25 | "added_snapshot_id": 7426647800932772370, 26 | "added_data_files_count": 2, 27 | "existing_data_files_count": 0, 28 | "deleted_data_files_count": 0, 29 | "added_rows_count": 5, 30 | "existing_rows_count": 0, 31 | "deleted_rows_count": 0, 32 | "partitions": [] 33 | } 34 | ] -------------------------------------------------------------------------------- /minio/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Registers the minios3 alias, then creates the configured buckets and optionally copies files from /data into them. 3 | echo "INPUT_BUCKETS=$INPUT_BUCKETS" 4 | echo "COPY_DIR=$COPY_DIR" 5 | # Retry once per second until mc can register the MinIO host alias. 6 | while ! /usr/bin/mc config host add minios3 http://minio-s3:9000 "$USER" "$PASSWORD"; 7 | do echo 'MinIO not up and running yet...' 
&& sleep 1; 8 | done; 9 | 10 | echo 'Added mc host config.'; 11 | 12 | echo "Variable COPY_DIR is set to $COPY_DIR" 13 | 14 | mc alias list 15 | # COPY_DIR=true mirrors all of /data into a single "data" bucket; otherwise INPUT_BUCKETS and COPY_DATA are paired by index. 16 | if [ "$COPY_DIR" = "true" ] 17 | then 18 | bucket="data" 19 | 20 | /usr/bin/mc mb "minios3/$bucket"; 21 | /usr/bin/mc mirror /data/ "minios3/$bucket"; 22 | else 23 | buckets=( $(echo $INPUT_BUCKETS | tr "," " ") ) 24 | copy_data=( $(echo $COPY_DATA | tr "," " ") ) 25 | # NOTE(review): array syntax is not POSIX sh; this assumes the invoking shell is bash-compatible - confirm for the image in use. 26 | length_buckets=${#buckets[@]} 27 | length_data=${#copy_data[@]} 28 | echo "The length of the arrays is: $length_buckets and $length_data" 29 | 30 | end=$((length_buckets - 1)) 31 | 32 | for i in $(seq 0 $end); 33 | do 34 | bucket=${buckets[i]} 35 | bucket=$(echo "$bucket" | tr -d "[:blank:]()") 36 | 37 | file=${copy_data[i]} 38 | # shellcheck disable=SC2006 39 | file=`echo "$file" | tr -d "[:blank:]()"` 40 | 41 | echo "Printing file and bucket" 42 | echo "$file" and "$bucket" 43 | # Create the bucket, then stat that same bucket (previously a hard-coded bucket name was inspected here). 44 | /usr/bin/mc mb "minios3/$bucket" 45 | mc stat --json "minios3/$bucket" 46 | # Copy only when a file is listed at this index; buckets without a matching file stay empty. 47 | if [ -n "${file}" ] 48 | then 49 | echo "ENTERED IF" 50 | /usr/bin/mc cp "/data/$file" "minios3/$bucket" 51 | fi 52 | done; 53 | fi 54 | 55 | echo "Bucket created" 56 | 57 | mc ls --json minios3 58 | 59 | 60 | 61 | exit 0; 62 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | spark-iceberg: 5 | image: spark-iceberg 6 | container_name: spark-iceberg 7 | build: ./spark 8 | entrypoint: ['./entrypoint.sh', 'master' ] 9 | env_file: 10 | - spark/.env 11 | volumes: 12 | - ./warehouse:/home/iceberg/warehouse 13 | - ./notebooks:/home/iceberg/notebooks 14 | - ./data:/opt/spark/data 15 | - ./spark_apps:/opt/spark/apps 16 | - spark-logs:/opt/spark/spark-events 17 | ports: 18 | - '8888:8888' 19 | - '8080:8080' 20 | - '10000:10000' 21 | - '10001:10001' 22 | 23 | 24 | spark-worker: 25 | image: spark-iceberg 26 | container_name: 
spark-worker 27 | entrypoint: [ './entrypoint.sh', 'worker' ] 28 | depends_on: 29 | - spark-iceberg 30 | env_file: 31 | - spark/.env 32 | environment: 33 | - SPARK_NO_DAEMONIZE=true 34 | volumes: 35 | - ./data:/opt/spark/data 36 | - ./warehouse:/home/iceberg/warehouse 37 | - ./spark_apps:/opt/spark/apps 38 | - spark-logs:/opt/spark/spark-events 39 | ports: 40 | - '8081:8081' 41 | 42 | spark-history-server: 43 | container_name: spark-history 44 | image: spark-iceberg 45 | entrypoint: [ './entrypoint.sh', 'history' ] 46 | depends_on: 47 | - spark-iceberg 48 | env_file: 49 | - spark/.env 50 | environment: 51 | - SPARK_NO_DAEMONIZE=true 52 | volumes: 53 | - spark-logs:/opt/spark/spark-events 54 | ports: 55 | - '18080:18080' 56 | 57 | 58 | volumes: 59 | spark-logs: 60 | -------------------------------------------------------------------------------- /minio/README.md: -------------------------------------------------------------------------------- 1 | # Setting up minio buckets and data 2 | 3 | The `entrypoint.sh` script is based upon the commands that 4 | I found while reading the book Data Pipelines with Apache 5 | Airflow (I actually think that I expanded on it a fair 6 | amount). 7 | The code for the book is here: 8 | https://github.com/BasPH/data-pipelines-with-apache-airflow 9 | You can get the book here: 10 | https://www.manning.com/books/data-pipelines-with-apache-airflow 11 | 12 | The shell commands in the book are given in the docker-compose 13 | file. I extracted them to the `entrypoint.sh` file and expanded 14 | the file a bit. 15 | 16 | It supports the following options: 17 | - `PASSWORD` - to set the password for the minio user. 18 | - `COPY_DIR` - when set to true, the dataset folder is mirrored 19 | to minio s3. If not set to true, then provide the options 20 | INPUT_BUCKETS and COPY_DATA. 
21 | - `INPUT_BUCKETS` - the buckets to create 22 | - `COPY_DATA` - the data files to copy into the buckets 23 | 24 | **NOTE:** the lengths of the `INPUT_BUCKETS` and `COPY_DATA` 25 | arrays need to be the same. Each file in `COPY_DATA` will be 26 | copied to the corresponding bucket by list index. 27 | Elements are separated with a comma. 28 | 29 | In case more buckets are listed than files, the 30 | shell script should create an empty bucket for every element 31 | whose index is >= the number of files listed. 32 | 33 | An example to define the two variables: 34 | - `INPUT_BUCKETS=test,foo,bar` 35 | - `COPY_DATA=movies.csv,ratings_smaller.csv,movies.csv` 36 | 37 | For `COPY_DATA` it is assumed that the files are in the /data 38 | directory in the docker container. 39 | 40 | You'll also need to create a .env file. You can copy 41 | .env.backup. 42 | -------------------------------------------------------------------------------- /notebooks/read-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "694fd852-8c95-411f-864c-919941ba9d22", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark.sql import SparkSession" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "30542263-049f-49aa-9ca1-c821d5cf818e", 19 | "metadata": { 20 | "tags": [] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 25 | "\n", 26 | "spark" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "745a38ac-3160-46f4-9513-4f42f8b9fbc1", 33 | "metadata": { 34 | "tags": [] 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "res = spark.sql(\"SELECT * FROM db.test\")\n", 39 | "res.show()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": 
"b5d3c232-1ab7-46a8-9632-533ba7186510", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "spark.stop()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "5e0aefc3-5113-46b1-9b23-79616009cabb", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [] 59 | } 60 | ], 61 | "metadata": { 62 | "kernelspec": { 63 | "display_name": "Python 3 (ipykernel)", 64 | "language": "python", 65 | "name": "python3" 66 | }, 67 | "language_info": { 68 | "codemirror_mode": { 69 | "name": "ipython", 70 | "version": 3 71 | }, 72 | "file_extension": ".py", 73 | "mimetype": "text/x-python", 74 | "name": "python", 75 | "nbconvert_exporter": "python", 76 | "pygments_lexer": "ipython3", 77 | "version": "3.11.6" 78 | } 79 | }, 80 | "nbformat": 4, 81 | "nbformat_minor": 5 82 | } 83 | -------------------------------------------------------------------------------- /docker-compose-minio.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | pg-catalog: 5 | image: postgres:15-alpine 6 | container_name: pg_catalog 7 | networks: 8 | iceberg-net: 9 | environment: 10 | - POSTGRES_USER=iceberg 11 | - POSTGRES_PASSWORD=iceberg 12 | - POSTGRES_DB=iceberg 13 | healthcheck: 14 | test: [ "CMD", "pg_isready", "-U", "iceberg" ] 15 | interval: 5s 16 | retries: 5 17 | ports: 18 | - "5432:5432" 19 | 20 | minio-s3: 21 | image: minio/minio 22 | container_name: iceberg_s3 23 | ports: 24 | - "9000:9000" 25 | - "9001:9001" 26 | env_file: 27 | - ./minio/.env 28 | command: server --console-address ":9001" /data 29 | networks: 30 | iceberg-net: 31 | aliases: 32 | - iceberg-data.minio-s3 33 | volumes: 34 | - minio-s3-data:/data 35 | healthcheck: 36 | test: ["CMD", "mc", "ready", "local"] 37 | interval: 30s 38 | timeout: 20s 39 | retries: 3 40 | 41 | minio-s3-init: 42 | build: ./minio/ 43 | networks: 44 | iceberg-net: 45 | env_file: 46 | - ./minio/.env 47 | volumes: 48 | - ./minio/data/:/data 49 | 
environment: 50 | - USER=user 51 | - COPY_DIR=false 52 | - INPUT_BUCKETS=iceberg-data 53 | depends_on: 54 | - minio-s3 55 | entrypoint: /bin/sh ./entrypoint.sh 56 | 57 | 58 | spark-iceberg: 59 | build: 60 | context: ./spark 61 | args: 62 | SPARK_DEFAULTS_CONF: spark-defaults-minio.conf 63 | networks: 64 | iceberg-net: 65 | depends_on: 66 | pg-catalog: 67 | condition: service_healthy 68 | minio-s3-init: 69 | condition: service_completed_successfully 70 | 71 | spark-worker: 72 | networks: 73 | iceberg-net: 74 | 75 | spark-history-server: 76 | networks: 77 | iceberg-net: 78 | 79 | volumes: 80 | minio-s3-data: 81 | 82 | networks: 83 | iceberg-net: 84 | -------------------------------------------------------------------------------- /medium-data/transformed_avro_files/manifest_lists/snap-7825605915503001692-1-d4223bc8-a7bd-46b3-80f3-7434fb900e74.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "manifest_path": "s3://iceberg-data/db/minio/metadata/d4223bc8-a7bd-46b3-80f3-7434fb900e74-m1.avro", 4 | "manifest_length": 6976, 5 | "partition_spec_id": 0, 6 | "content": 0, 7 | "sequence_number": 3, 8 | "min_sequence_number": 3, 9 | "added_snapshot_id": 7825605915503001692, 10 | "added_data_files_count": 1, 11 | "existing_data_files_count": 0, 12 | "deleted_data_files_count": 0, 13 | "added_rows_count": 2, 14 | "existing_rows_count": 0, 15 | "deleted_rows_count": 0, 16 | "partitions": [] 17 | }, 18 | { 19 | "manifest_path": "s3://iceberg-data/db/minio/metadata/274b9f78-7835-494a-88e7-c4a7fbc87659-m0.avro", 20 | "manifest_length": 6947, 21 | "partition_spec_id": 0, 22 | "content": 0, 23 | "sequence_number": 2, 24 | "min_sequence_number": 2, 25 | "added_snapshot_id": 2032559078621466157, 26 | "added_data_files_count": 1, 27 | "existing_data_files_count": 0, 28 | "deleted_data_files_count": 0, 29 | "added_rows_count": 1, 30 | "existing_rows_count": 0, 31 | "deleted_rows_count": 0, 32 | "partitions": [] 33 | }, 34 | { 35 | 
"manifest_path": "s3://iceberg-data/db/minio/metadata/d4223bc8-a7bd-46b3-80f3-7434fb900e74-m0.avro", 36 | "manifest_length": 7075, 37 | "partition_spec_id": 0, 38 | "content": 0, 39 | "sequence_number": 3, 40 | "min_sequence_number": 1, 41 | "added_snapshot_id": 7825605915503001692, 42 | "added_data_files_count": 0, 43 | "existing_data_files_count": 1, 44 | "deleted_data_files_count": 1, 45 | "added_rows_count": 0, 46 | "existing_rows_count": 2, 47 | "deleted_rows_count": 3, 48 | "partitions": [] 49 | } 50 | ] -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #postgres catalog instructions 2 | down-pg-catalog: 3 | docker compose -f docker-compose.yml -f docker-compose-pg-catalog.yml down 4 | 5 | 6 | start-pg-catalog: 7 | make stop-pg-catalog && docker compose -f docker-compose.yml -f docker-compose-pg-catalog.yml up 8 | 9 | stop-pg-catalog: 10 | docker compose -f docker-compose.yml -f docker-compose-pg-catalog.yml stop 11 | 12 | run-pg-catalog: 13 | make stop-pg-catalog && docker compose -f docker-compose.yml -f docker-compose-pg-catalog.yml up 14 | 15 | spark-build-services-pg-catalog: 16 | docker compose -f docker-compose.yml -f docker-compose-pg-catalog.yml build spark-iceberg spark-worker spark-history-server --no-cache 17 | 18 | build-pg-catalog: 19 | make down-pg-catalog && docker compose -f docker-compose.yml -f docker-compose-pg-catalog.yml build 20 | 21 | clean-pg-catalog: 22 | docker compose -f docker-compose.yml -f docker-compose-pg-catalog.yml down --rmi="all" --volumes 23 | 24 | 25 | #iceberg with minio s3 storage and pg catalog instructions 26 | down-iceberg-minio: 27 | docker compose -f docker-compose.yml -f docker-compose-minio.yml down 28 | 29 | 30 | start-iceberg-minio: 31 | make stop-iceberg-minio && docker compose -f docker-compose.yml -f docker-compose-minio.yml up 32 | 33 | stop-iceberg-minio: 34 | docker compose -f 
"""Dump Avro files (for example Iceberg manifests and manifest lists) to JSON.

Each input ``<name>.avro`` is decoded and written as ``<name>.json`` next to
this script, so the generated files match the ``helper_scripts/*.json`` ignore
rule and are the same files that ``--clear`` removes.
"""
import ast
import json
import os
from pathlib import Path

import click
from avro.datafile import DataFileReader
from avro.io import DatumReader
from avro.schema import make_avsc_object

# Directory holding this script; JSON output is written here and ``--clear``
# cleans it, so both operations agree regardless of the current directory.
SCRIPT_DIR = Path(__file__).resolve().parent


def byteToStr(input_dict):
    """Return a copy of ``input_dict`` in which every ``bytes`` value is a ``str``.

    The structure is walked recursively (dicts, lists and tuples) instead of
    round-tripping through ``str()`` and ``ast.literal_eval``.  The textual
    approach silently corrupted any text that ended in ``b`` right before a
    quote (``'tab'`` became ``'ta'``) and raised on values containing quotes.

    Bytes are decoded as latin-1: it maps every byte to exactly one code
    point, which is what the previous implementation produced for inputs it
    handled correctly (e.g. ``b'2\\x00\\x00\\x00'`` -> ``'2\\x00\\x00\\x00'``),
    and it can never fail on binary payloads.

    :param input_dict: a decoded Avro record (any nesting of dict/list/tuple
        and scalar values).
    :return: an equivalent structure that contains no ``bytes`` objects.
    """
    if isinstance(input_dict, (bytes, bytearray)):
        return bytes(input_dict).decode("latin-1")
    if isinstance(input_dict, dict):
        return {byteToStr(key): byteToStr(value) for key, value in input_dict.items()}
    if isinstance(input_dict, list):
        return [byteToStr(item) for item in input_dict]
    if isinstance(input_dict, tuple):
        return tuple(byteToStr(item) for item in input_dict)
    return input_dict


def delete_non_python_files():
    """Delete every regular file in this script's directory whose name does not end in ``py``.

    Sub-directories (``__pycache__`` and the like) are skipped: ``os.remove``
    cannot delete a directory and used to abort the whole clean-up.
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))

    print(current_dir)

    for filename in os.listdir(current_dir):
        target = os.path.join(current_dir, filename)
        if filename.endswith("py") or not os.path.isfile(target):
            continue
        os.remove(target)


def read_file(filepath, schema):
    """Decode one Avro container file and write its rows as indented JSON.

    :param filepath: path of the ``.avro`` file to read.
    :param schema: schema object handed to ``DatumReader``.
    :raises OSError: if the input cannot be opened or the output cannot be written.
    """
    source = Path(filepath)

    # The reader is used as a context manager so it is closed even when
    # decoding fails half-way through the file.
    with open(source, 'rb') as f, DataFileReader(f, DatumReader(schema)) as avro_reader:
        str_rows = [byteToStr(row) for row in avro_reader]

    name = source.name.split(".avro")[0]

    with open(SCRIPT_DIR / f"{name}.json", "w") as out:
        json.dump(str_rows, out, indent=4)


@click.command()
@click.option('--path', help="Provide file path")
@click.option('--all-dir', help="Provide directory path")
@click.option('--clear', is_flag=True)
def print_avro(path: str, all_dir: str, clear: bool):
    """Convert one Avro file (--path) or every *.avro file in a directory (--all-dir) to JSON."""
    # Exactly one of the two options must be given.
    if (path is None) == (all_dir is None):
        raise click.UsageError("one of --path or --all-dir should be provided")

    if clear:
        delete_non_python_files()

    p = Path(path or all_dir)

    # NOTE(review): DataFileReader appears to take the writer schema from the
    # file header, so this Employee schema is probably not what decodes
    # manifest files -- confirm before relying on it.
    schema = {
        "type": "record",
        "name": "Employee",
        "fields": [
            {"name": "firstname", "type": "string"},
            {"name": "middlename", "type": "string"},
            {"name": "lastname", "type": "string"},
            {"name": "id", "type": "string"},
            {"name": "gender", "type": "string"},
            {"name": "salary", "type": "int"},
        ]
    }

    schema = make_avsc_object(schema)

    # Sorted so that a directory is always processed in the same order.
    files = sorted(p.glob("*.avro")) if p.is_dir() else [p]

    for file in files:
        read_file(file, schema)


if __name__ == '__main__':
    print_avro()
"added_snapshot_id": 739442481904053118, 10 | "added_data_files_count": 1, 11 | "existing_data_files_count": 0, 12 | "deleted_data_files_count": 0, 13 | "added_rows_count": 1, 14 | "existing_rows_count": 0, 15 | "deleted_rows_count": 0, 16 | "partitions": [] 17 | }, 18 | { 19 | "manifest_path": "s3://iceberg-data/db/minio/metadata/d4223bc8-a7bd-46b3-80f3-7434fb900e74-m1.avro", 20 | "manifest_length": 6976, 21 | "partition_spec_id": 0, 22 | "content": 0, 23 | "sequence_number": 3, 24 | "min_sequence_number": 3, 25 | "added_snapshot_id": 7825605915503001692, 26 | "added_data_files_count": 1, 27 | "existing_data_files_count": 0, 28 | "deleted_data_files_count": 0, 29 | "added_rows_count": 2, 30 | "existing_rows_count": 0, 31 | "deleted_rows_count": 0, 32 | "partitions": [] 33 | }, 34 | { 35 | "manifest_path": "s3://iceberg-data/db/minio/metadata/c5934717-68b5-4262-8218-a2de395ba51e-m0.avro", 36 | "manifest_length": 6948, 37 | "partition_spec_id": 0, 38 | "content": 0, 39 | "sequence_number": 4, 40 | "min_sequence_number": 4, 41 | "added_snapshot_id": 739442481904053118, 42 | "added_data_files_count": 0, 43 | "existing_data_files_count": 0, 44 | "deleted_data_files_count": 1, 45 | "added_rows_count": 0, 46 | "existing_rows_count": 0, 47 | "deleted_rows_count": 1, 48 | "partitions": [] 49 | }, 50 | { 51 | "manifest_path": "s3://iceberg-data/db/minio/metadata/d4223bc8-a7bd-46b3-80f3-7434fb900e74-m0.avro", 52 | "manifest_length": 7075, 53 | "partition_spec_id": 0, 54 | "content": 0, 55 | "sequence_number": 3, 56 | "min_sequence_number": 1, 57 | "added_snapshot_id": 7825605915503001692, 58 | "added_data_files_count": 0, 59 | "existing_data_files_count": 1, 60 | "deleted_data_files_count": 1, 61 | "added_rows_count": 0, 62 | "existing_rows_count": 2, 63 | "deleted_rows_count": 3, 64 | "partitions": [] 65 | } 66 | ] -------------------------------------------------------------------------------- 
/medium-data/db/minio/metadata/00000-f5b4c31a-cf6e-4723-a086-0c66c6a102f4.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "format-version" : 2, 3 | "table-uuid" : "0016734d-5067-4b3c-be6f-4ca824f900fc", 4 | "location" : "s3://iceberg-data/db/minio", 5 | "last-sequence-number" : 1, 6 | "last-updated-ms" : 1706516546185, 7 | "last-column-id" : 6, 8 | "current-schema-id" : 0, 9 | "schemas" : [ { 10 | "type" : "struct", 11 | "schema-id" : 0, 12 | "fields" : [ { 13 | "id" : 1, 14 | "name" : "firstname", 15 | "required" : false, 16 | "type" : "string" 17 | }, { 18 | "id" : 2, 19 | "name" : "middlename", 20 | "required" : false, 21 | "type" : "string" 22 | }, { 23 | "id" : 3, 24 | "name" : "lastname", 25 | "required" : false, 26 | "type" : "string" 27 | }, { 28 | "id" : 4, 29 | "name" : "id", 30 | "required" : false, 31 | "type" : "string" 32 | }, { 33 | "id" : 5, 34 | "name" : "gender", 35 | "required" : false, 36 | "type" : "string" 37 | }, { 38 | "id" : 6, 39 | "name" : "salary", 40 | "required" : false, 41 | "type" : "int" 42 | } ] 43 | } ], 44 | "default-spec-id" : 0, 45 | "partition-specs" : [ { 46 | "spec-id" : 0, 47 | "fields" : [ ] 48 | } ], 49 | "last-partition-id" : 999, 50 | "default-sort-order-id" : 0, 51 | "sort-orders" : [ { 52 | "order-id" : 0, 53 | "fields" : [ ] 54 | } ], 55 | "properties" : { 56 | "owner" : "root", 57 | "write.parquet.compression-codec" : "zstd" 58 | }, 59 | "current-snapshot-id" : 7426647800932772370, 60 | "refs" : { 61 | "main" : { 62 | "snapshot-id" : 7426647800932772370, 63 | "type" : "branch" 64 | } 65 | }, 66 | "snapshots" : [ { 67 | "sequence-number" : 1, 68 | "snapshot-id" : 7426647800932772370, 69 | "timestamp-ms" : 1706516546185, 70 | "summary" : { 71 | "operation" : "append", 72 | "spark.app.id" : "app-20240129082210-0000", 73 | "added-data-files" : "2", 74 | "added-records" : "5", 75 | "added-files-size" : "3365", 76 | "changed-partition-count" : "1", 77 | 
"total-records" : "5", 78 | "total-files-size" : "3365", 79 | "total-data-files" : "2", 80 | "total-delete-files" : "0", 81 | "total-position-deletes" : "0", 82 | "total-equality-deletes" : "0" 83 | }, 84 | "manifest-list" : "s3://iceberg-data/db/minio/metadata/snap-7426647800932772370-1-b10658da-8308-41c7-9209-217725fa8660.avro", 85 | "schema-id" : 0 86 | } ], 87 | "statistics" : [ ], 88 | "snapshot-log" : [ { 89 | "timestamp-ms" : 1706516546185, 90 | "snapshot-id" : 7426647800932772370 91 | } ], 92 | "metadata-log" : [ ] 93 | } -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/iceberg-getting-started-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "694fd852-8c95-411f-864c-919941ba9d22", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark.sql import SparkSession\n", 13 | "from pyspark.sql.types import StructType,StructField, StringType, IntegerType" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "30542263-049f-49aa-9ca1-c821d5cf818e", 20 | "metadata": { 21 | "tags": [] 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 26 | "\n", 27 | "spark" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "ebce8acc-912a-4e17-a0af-805dc7117570", 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 40 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 41 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 42 | " (\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 43 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 44 | " ]\n", 45 | "\n", 46 | "schema = StructType([ \\\n", 47 | 
" StructField(\"firstname\",StringType(),True), \\\n", 48 | " StructField(\"middlename\",StringType(),True), \\\n", 49 | " StructField(\"lastname\",StringType(),True), \\\n", 50 | " StructField(\"id\", StringType(), True), \\\n", 51 | " StructField(\"gender\", StringType(), True), \\\n", 52 | " StructField(\"salary\", IntegerType(), True) \\\n", 53 | " ])" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "6d06e12f-ec86-4935-a065-68901e98c8b9", 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "555b9977-8792-4476-8110-25f62bd981ff", 72 | "metadata": { 73 | "tags": [] 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "df = spark.createDataFrame(data=data, schema=schema)\n", 78 | "df.printSchema()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "56bce0e3-bbc1-4531-ad29-6d7a36eea73c", 85 | "metadata": { 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "df.writeTo(\"db.test\").create()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "745a38ac-3160-46f4-9513-4f42f8b9fbc1", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "Python 3 (ipykernel)", 105 | "language": "python", 106 | "name": "python3" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 3 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython3", 118 | "version": "3.10.10" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 5 123 | } 124 | -------------------------------------------------------------------------------- /notebooks/getting-started.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "694fd852-8c95-411f-864c-919941ba9d22", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark.sql import SparkSession\n", 13 | "from pyspark.sql.types import StructType,StructField, StringType, IntegerType" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "30542263-049f-49aa-9ca1-c821d5cf818e", 20 | "metadata": { 21 | "tags": [] 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 26 | "\n", 27 | "spark" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "ebce8acc-912a-4e17-a0af-805dc7117570", 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 40 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 41 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 42 | " (\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 43 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 44 | " ]\n", 45 | "\n", 46 | "schema = StructType([ \\\n", 47 | " StructField(\"firstname\",StringType(),True), \\\n", 48 | " StructField(\"middlename\",StringType(),True), \\\n", 49 | " StructField(\"lastname\",StringType(),True), \\\n", 50 | " StructField(\"id\", StringType(), True), \\\n", 51 | " StructField(\"gender\", StringType(), True), \\\n", 52 | " StructField(\"salary\", IntegerType(), True) \\\n", 53 | " ])" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "6d06e12f-ec86-4935-a065-68901e98c8b9", 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": 
"555b9977-8792-4476-8110-25f62bd981ff", 72 | "metadata": { 73 | "tags": [] 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "df = spark.createDataFrame(data=data, schema=schema)\n", 78 | "df.printSchema()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "56bce0e3-bbc1-4531-ad29-6d7a36eea73c", 85 | "metadata": { 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "df.writeTo(\"db.test\").createOrReplace()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "b5d3c232-1ab7-46a8-9632-533ba7186510", 97 | "metadata": { 98 | "tags": [] 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "spark.stop()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "bb2e01b8-8761-4eab-a3a9-ecf5d03eae3f", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3 (ipykernel)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.11.7" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 5 135 | } 136 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/getting-started-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "694fd852-8c95-411f-864c-919941ba9d22", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark.sql import SparkSession\n", 13 | "from pyspark.sql.types import StructType,StructField, StringType, IntegerType" 
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "30542263-049f-49aa-9ca1-c821d5cf818e", 20 | "metadata": { 21 | "tags": [] 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 26 | "\n", 27 | "spark" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "ebce8acc-912a-4e17-a0af-805dc7117570", 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 40 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 41 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 42 | " (\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 43 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 44 | " ]\n", 45 | "\n", 46 | "schema = StructType([ \\\n", 47 | " StructField(\"firstname\",StringType(),True), \\\n", 48 | " StructField(\"middlename\",StringType(),True), \\\n", 49 | " StructField(\"lastname\",StringType(),True), \\\n", 50 | " StructField(\"id\", StringType(), True), \\\n", 51 | " StructField(\"gender\", StringType(), True), \\\n", 52 | " StructField(\"salary\", IntegerType(), True) \\\n", 53 | " ])" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "6d06e12f-ec86-4935-a065-68901e98c8b9", 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "555b9977-8792-4476-8110-25f62bd981ff", 72 | "metadata": { 73 | "tags": [] 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "df = spark.createDataFrame(data=data, schema=schema)\n", 78 | "df.printSchema()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "56bce0e3-bbc1-4531-ad29-6d7a36eea73c", 85 | "metadata": { 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | 
"df.writeTo(\"db.test\").createOrReplace()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "b5d3c232-1ab7-46a8-9632-533ba7186510", 97 | "metadata": { 98 | "tags": [] 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "spark.stop()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "bb2e01b8-8761-4eab-a3a9-ecf5d03eae3f", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3 (ipykernel)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.11.7" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 5 135 | } 136 | -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/00001-4c5adbca-0447-491b-999c-50c28051f24d.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "format-version" : 2, 3 | "table-uuid" : "0016734d-5067-4b3c-be6f-4ca824f900fc", 4 | "location" : "s3://iceberg-data/db/minio", 5 | "last-sequence-number" : 2, 6 | "last-updated-ms" : 1706518166494, 7 | "last-column-id" : 6, 8 | "current-schema-id" : 0, 9 | "schemas" : [ { 10 | "type" : "struct", 11 | "schema-id" : 0, 12 | "fields" : [ { 13 | "id" : 1, 14 | "name" : "firstname", 15 | "required" : false, 16 | "type" : "string" 17 | }, { 18 | "id" : 2, 19 | "name" : "middlename", 20 | "required" : false, 21 | "type" : "string" 22 | }, { 23 | "id" : 3, 24 | "name" : "lastname", 25 | "required" : false, 26 | "type" : "string" 27 | }, { 28 | "id" : 4, 29 | "name" : "id", 30 | "required" : false, 31 | "type" : "string" 32 
| }, { 33 | "id" : 5, 34 | "name" : "gender", 35 | "required" : false, 36 | "type" : "string" 37 | }, { 38 | "id" : 6, 39 | "name" : "salary", 40 | "required" : false, 41 | "type" : "int" 42 | } ] 43 | } ], 44 | "default-spec-id" : 0, 45 | "partition-specs" : [ { 46 | "spec-id" : 0, 47 | "fields" : [ ] 48 | } ], 49 | "last-partition-id" : 999, 50 | "default-sort-order-id" : 0, 51 | "sort-orders" : [ { 52 | "order-id" : 0, 53 | "fields" : [ ] 54 | } ], 55 | "properties" : { 56 | "owner" : "root", 57 | "write.parquet.compression-codec" : "zstd" 58 | }, 59 | "current-snapshot-id" : 2032559078621466157, 60 | "refs" : { 61 | "main" : { 62 | "snapshot-id" : 2032559078621466157, 63 | "type" : "branch" 64 | } 65 | }, 66 | "snapshots" : [ { 67 | "sequence-number" : 1, 68 | "snapshot-id" : 7426647800932772370, 69 | "timestamp-ms" : 1706516546185, 70 | "summary" : { 71 | "operation" : "append", 72 | "spark.app.id" : "app-20240129082210-0000", 73 | "added-data-files" : "2", 74 | "added-records" : "5", 75 | "added-files-size" : "3365", 76 | "changed-partition-count" : "1", 77 | "total-records" : "5", 78 | "total-files-size" : "3365", 79 | "total-data-files" : "2", 80 | "total-delete-files" : "0", 81 | "total-position-deletes" : "0", 82 | "total-equality-deletes" : "0" 83 | }, 84 | "manifest-list" : "s3://iceberg-data/db/minio/metadata/snap-7426647800932772370-1-b10658da-8308-41c7-9209-217725fa8660.avro", 85 | "schema-id" : 0 86 | }, { 87 | "sequence-number" : 2, 88 | "snapshot-id" : 2032559078621466157, 89 | "parent-snapshot-id" : 7426647800932772370, 90 | "timestamp-ms" : 1706518166494, 91 | "summary" : { 92 | "operation" : "append", 93 | "spark.app.id" : "app-20240129082210-0000", 94 | "added-data-files" : "1", 95 | "added-records" : "1", 96 | "added-files-size" : "1636", 97 | "changed-partition-count" : "1", 98 | "total-records" : "6", 99 | "total-files-size" : "5001", 100 | "total-data-files" : "3", 101 | "total-delete-files" : "0", 102 | "total-position-deletes" : "0", 103 
FROM python:3.11-bullseye AS spark-base

# Install tools required by the OS
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
      sudo \
      curl \
      vim \
      unzip \
      openjdk-11-jdk \
      build-essential \
      software-properties-common \
      ssh && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


# Setup the directories for our Spark and Hadoop installations
ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"}
# The py4j archive name must match the one bundled with the Spark release
# below; Spark 3.5.0 ships py4j 0.10.9.7 (0.10.9.5 belonged to Spark 3.3.x).
ENV PYTHONPATH=$SPARK_HOME/python/:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH

RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME}
WORKDIR ${SPARK_HOME}

# Download and install Spark
ENV SPARK_VERSION=3.5.0

# Name of the spark-defaults file (relative to the build context) to bake in.
ARG SPARK_DEFAULTS_CONF=spark-defaults.conf

# -f makes curl fail on HTTP errors instead of saving an error page as the
# tarball; -L follows redirects. archive.apache.org keeps every release,
# whereas the CDN mirror is understood to carry only current ones.
RUN curl -fSL https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
 && tar xvzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
 && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz



FROM spark-base AS pyspark-base

# Install python deps
COPY requirements/requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt



FROM pyspark-base AS pyspark

# Re-declared so the build argument is in scope for the COPY below on every
# builder (ARG values are scoped to the stage that declares them).
ARG SPARK_DEFAULTS_CONF=spark-defaults.conf

# Setup Spark related environment variables
ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
ENV SPARK_MASTER="spark://spark-iceberg:7077"
ENV SPARK_MASTER_HOST=spark-iceberg
ENV SPARK_MASTER_PORT=7077
ENV PYSPARK_PYTHON=python3

# Copy the selected configuration into $SPARK_HOME/conf
#COPY spark-defaults.conf "$SPARK_HOME/conf"
COPY ${SPARK_DEFAULTS_CONF} "$SPARK_HOME/conf/spark-defaults.conf"

RUN chmod u+x /opt/spark/sbin/* && \
    chmod u+x /opt/spark/bin/*

# Copy appropriate entrypoint script
COPY entrypoint.sh .

ENTRYPOINT ["./entrypoint.sh"]


FROM pyspark AS spark-iceberg

ARG SPARK_MAJOR_VERSION=3.5
ARG ICEBERG_VERSION=1.4.3
ARG ICEBERG_SPARK_SCALA="iceberg-spark-runtime-${SPARK_MAJOR_VERSION}_2.12"
ARG JAR_PACKAGE="${ICEBERG_SPARK_SCALA}-${ICEBERG_VERSION}.jar"

# Download iceberg spark runtime
RUN curl -f https://repo1.maven.org/maven2/org/apache/iceberg/${ICEBERG_SPARK_SCALA}/${ICEBERG_VERSION}/${JAR_PACKAGE} -Lo /opt/spark/jars/${JAR_PACKAGE}

# Download Java AWS bundle
# RUN curl https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/2.17.257/bundle-2.17.257.jar -Lo /opt/spark/jars/bundle-2.17.257.jar
RUN curl -fsS https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar

# Install AWS CLI
# `uname -m` yields x86_64 or aarch64, which are the two installer names AWS
# publishes, so the image also builds on arm64 hosts.
RUN curl -fSL "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
 && unzip awscliv2.zip \
 && sudo ./aws/install \
 && rm awscliv2.zip \
 && rm -rf aws/


# Install PostgreSQL JDBC Driver
RUN curl -fSL "https://jdbc.postgresql.org/download/postgresql-42.6.0.jar" -o "postgresql-42.6.0.jar" \
 && mv postgresql-42.6.0.jar "${SPARK_HOME}/jars/postgresql-42.6.0.jar"

# Add iceberg spark runtime jar to IJava classpath
ENV IJAVA_CLASSPATH=/opt/spark/jars/*

RUN mkdir -p /home/iceberg/localwarehouse /home/iceberg/notebooks /home/iceberg/warehouse /home/iceberg/spark-events /home/iceberg

ARG jupyterlab_version=4.0.8

# The apt package lists are removed in the same layer so they do not end up
# in the final image.
RUN apt-get update -y && \
    apt-get install -y python3-pip python3-dev && \
    pip3 install --no-cache-dir --upgrade pip && \
    pip3 install --no-cache-dir wget jupyterlab==${jupyterlab_version} && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Add a notebook command: a small launcher script that starts pyspark with
# JupyterLab as the driver front-end.
RUN echo '#! /bin/sh' >> /bin/notebook \
 && echo 'export PYSPARK_DRIVER_PYTHON=jupyter' >> /bin/notebook \
 && echo "export PYSPARK_DRIVER_PYTHON_OPTS=\"lab --notebook-dir=/home/iceberg/notebooks --ip='0.0.0.0' --NotebookApp.token='' --port=8888 --no-browser --allow-root\"" >> /bin/notebook \
# && echo 'pyspark --master local[*]' >> /bin/notebook \
 && echo 'pyspark' >> /bin/notebook \
 && chmod u+x /bin/notebook


ENTRYPOINT ["./entrypoint.sh"]
CMD ["notebook"]
"outputs": [], 25 | "source": [ 26 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 27 | "\n", 28 | "spark.stop()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "c17dad7b-c9ad-4c89-a695-6038170df7f3", 35 | "metadata": { 36 | "tags": [] 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "from pprint import pprint\n", 41 | "\n", 42 | "conf = SparkConf().set(\"spark.sql.catalog.spark_catalog.defaultDatabase\", \"db\")\n", 43 | "\n", 44 | "spark = SparkSession.builder.config(conf=conf).appName(\"Jupyter\").getOrCreate()\n", 45 | "\n", 46 | "pprint(spark.sparkContext.getConf().getAll())\n", 47 | "\n", 48 | "spark" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "fb9bb08e-33fa-449b-8b61-83a84cf1e7f1", 55 | "metadata": { 56 | "tags": [] 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "spark.catalog" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "2ddb296d-5c5c-4cd5-b07d-635d1635f172", 67 | "metadata": { 68 | "tags": [] 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "spark.catalog.currentDatabase()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "12a15e69-af48-4458-9212-627fdaa0c829", 79 | "metadata": { 80 | "tags": [] 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "spark.catalog.listTables()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "c45998b3-985c-49f3-8e9f-1125582b8cd1", 91 | "metadata": { 92 | "tags": [] 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "spark.catalog.listDatabases()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "69c6ee8a-da4e-4827-8ea5-f8cd04aad5c0", 103 | "metadata": { 104 | "tags": [] 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "spark.sql('show tables from db').show()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 
| "id": "1cd01fa6-9f46-44e2-907f-250c062bb0e1", 115 | "metadata": { 116 | "tags": [] 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "df = spark.table(\"data.db.pg_catalog\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "5fbb83a7-31f4-4267-84d9-756c72049a16", 127 | "metadata": { 128 | "tags": [] 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "df" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "31642dc0-dff1-49f3-87da-6cd662857af8", 139 | "metadata": { 140 | "tags": [] 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "df.show()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "a2c7d048-d502-4b66-a982-5d39767be6ea", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "spark.stop()" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3 (ipykernel)", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.10.11" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 5 179 | } 180 | -------------------------------------------------------------------------------- /notebooks/inspect-catalogs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "694fd852-8c95-411f-864c-919941ba9d22", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark.sql import SparkSession\n", 13 | "from pyspark.sql.types import StructType,StructField, StringType, IntegerType\n", 14 | "from pyspark.conf import SparkConf" 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "30542263-049f-49aa-9ca1-c821d5cf818e", 21 | "metadata": { 22 | "tags": [] 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 27 | "\n", 28 | "spark.stop()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "c17dad7b-c9ad-4c89-a695-6038170df7f3", 35 | "metadata": { 36 | "tags": [] 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "from pprint import pprint\n", 41 | "\n", 42 | "conf = SparkConf().set(\"spark.sql.catalog.spark_catalog.defaultDatabase\", \"db\")\n", 43 | "\n", 44 | "spark = SparkSession.builder.config(conf=conf).appName(\"Jupyter\").getOrCreate()\n", 45 | "\n", 46 | "pprint(spark.sparkContext.getConf().getAll())\n", 47 | "\n", 48 | "spark" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "fb9bb08e-33fa-449b-8b61-83a84cf1e7f1", 55 | "metadata": { 56 | "tags": [] 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "spark.catalog" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "2ddb296d-5c5c-4cd5-b07d-635d1635f172", 67 | "metadata": { 68 | "tags": [] 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "spark.catalog.currentDatabase()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "12a15e69-af48-4458-9212-627fdaa0c829", 79 | "metadata": { 80 | "tags": [] 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "spark.catalog.listTables()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "c45998b3-985c-49f3-8e9f-1125582b8cd1", 91 | "metadata": { 92 | "tags": [] 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "spark.catalog.listDatabases()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "69c6ee8a-da4e-4827-8ea5-f8cd04aad5c0", 103 | "metadata": { 104 | "tags": [] 105 | 
}, 106 | "outputs": [], 107 | "source": [ 108 | "spark.sql('show tables from db').show()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "1cd01fa6-9f46-44e2-907f-250c062bb0e1", 115 | "metadata": { 116 | "tags": [] 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "df = spark.table(\"data.db.pg_catalog\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "5fbb83a7-31f4-4267-84d9-756c72049a16", 127 | "metadata": { 128 | "tags": [] 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "df" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "31642dc0-dff1-49f3-87da-6cd662857af8", 139 | "metadata": { 140 | "tags": [] 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "df.show()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "a2c7d048-d502-4b66-a982-5d39767be6ea", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "spark.stop()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "415793dc-a28f-497c-8e37-b59b4608a261", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.11.6" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /medium-data/transformed_avro_files/manifests/c5934717-68b5-4262-8218-a2de395ba51e-m0.json: 
-------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "status": 2, 4 | "snapshot_id": 739442481904053118, 5 | "sequence_number": 2, 6 | "file_sequence_number": 2, 7 | "data_file": { 8 | "content": 0, 9 | "file_path": "s3://iceberg-data/db/minio/data/00007-9-012e468c-5234-40d8-8ce4-d7ec420a068f-00001.parquet", 10 | "file_format": "PARQUET", 11 | "partition": {}, 12 | "record_count": 1, 13 | "file_size_in_bytes": 1636, 14 | "column_sizes": [ 15 | { 16 | "key": 1, 17 | "value": 51 18 | }, 19 | { 20 | "key": 2, 21 | "value": 42 22 | }, 23 | { 24 | "key": 3, 25 | "value": 47 26 | }, 27 | { 28 | "key": 4, 29 | "value": 47 30 | }, 31 | { 32 | "key": 5, 33 | "value": 43 34 | }, 35 | { 36 | "key": 6, 37 | "value": 42 38 | } 39 | ], 40 | "value_counts": [ 41 | { 42 | "key": 1, 43 | "value": 1 44 | }, 45 | { 46 | "key": 2, 47 | "value": 1 48 | }, 49 | { 50 | "key": 3, 51 | "value": 1 52 | }, 53 | { 54 | "key": 4, 55 | "value": 1 56 | }, 57 | { 58 | "key": 5, 59 | "value": 1 60 | }, 61 | { 62 | "key": 6, 63 | "value": 1 64 | } 65 | ], 66 | "null_value_counts": [ 67 | { 68 | "key": 1, 69 | "value": 0 70 | }, 71 | { 72 | "key": 2, 73 | "value": 0 74 | }, 75 | { 76 | "key": 3, 77 | "value": 0 78 | }, 79 | { 80 | "key": 4, 81 | "value": 0 82 | }, 83 | { 84 | "key": 5, 85 | "value": 0 86 | }, 87 | { 88 | "key": 6, 89 | "value": 0 90 | } 91 | ], 92 | "nan_value_counts": [], 93 | "lower_bounds": [ 94 | { 95 | "key": 1, 96 | "value": "New James" 97 | }, 98 | { 99 | "key": 2, 100 | "value": "" 101 | }, 102 | { 103 | "key": 3, 104 | "value": "Smith" 105 | }, 106 | { 107 | "key": 4, 108 | "value": "36646" 109 | }, 110 | { 111 | "key": 5, 112 | "value": "M" 113 | }, 114 | { 115 | "key": 6, 116 | "value": "2\u0000\u0000\u0000" 117 | } 118 | ], 119 | "upper_bounds": [ 120 | { 121 | "key": 1, 122 | "value": "New James" 123 | }, 124 | { 125 | "key": 2, 126 | "value": "" 127 | }, 128 | { 129 | "key": 3, 130 | "value": "Smith" 131 | }, 132 | { 133 | 
"key": 4, 134 | "value": "36646" 135 | }, 136 | { 137 | "key": 5, 138 | "value": "M" 139 | }, 140 | { 141 | "key": 6, 142 | "value": "2\u0000\u0000\u0000" 143 | } 144 | ], 145 | "key_metadata": null, 146 | "split_offsets": [ 147 | 4 148 | ], 149 | "equality_ids": null, 150 | "sort_order_id": 0 151 | } 152 | } 153 | ] -------------------------------------------------------------------------------- /medium-data/transformed_avro_files/manifests/274b9f78-7835-494a-88e7-c4a7fbc87659-m0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "status": 1, 4 | "snapshot_id": 2032559078621466157, 5 | "sequence_number": null, 6 | "file_sequence_number": null, 7 | "data_file": { 8 | "content": 0, 9 | "file_path": "s3://iceberg-data/db/minio/data/00007-9-012e468c-5234-40d8-8ce4-d7ec420a068f-00001.parquet", 10 | "file_format": "PARQUET", 11 | "partition": {}, 12 | "record_count": 1, 13 | "file_size_in_bytes": 1636, 14 | "column_sizes": [ 15 | { 16 | "key": 1, 17 | "value": 51 18 | }, 19 | { 20 | "key": 2, 21 | "value": 42 22 | }, 23 | { 24 | "key": 3, 25 | "value": 47 26 | }, 27 | { 28 | "key": 4, 29 | "value": 47 30 | }, 31 | { 32 | "key": 5, 33 | "value": 43 34 | }, 35 | { 36 | "key": 6, 37 | "value": 42 38 | } 39 | ], 40 | "value_counts": [ 41 | { 42 | "key": 1, 43 | "value": 1 44 | }, 45 | { 46 | "key": 2, 47 | "value": 1 48 | }, 49 | { 50 | "key": 3, 51 | "value": 1 52 | }, 53 | { 54 | "key": 4, 55 | "value": 1 56 | }, 57 | { 58 | "key": 5, 59 | "value": 1 60 | }, 61 | { 62 | "key": 6, 63 | "value": 1 64 | } 65 | ], 66 | "null_value_counts": [ 67 | { 68 | "key": 1, 69 | "value": 0 70 | }, 71 | { 72 | "key": 2, 73 | "value": 0 74 | }, 75 | { 76 | "key": 3, 77 | "value": 0 78 | }, 79 | { 80 | "key": 4, 81 | "value": 0 82 | }, 83 | { 84 | "key": 5, 85 | "value": 0 86 | }, 87 | { 88 | "key": 6, 89 | "value": 0 90 | } 91 | ], 92 | "nan_value_counts": [], 93 | "lower_bounds": [ 94 | { 95 | "key": 1, 96 | "value": "New James" 97 | }, 
98 | { 99 | "key": 2, 100 | "value": "" 101 | }, 102 | { 103 | "key": 3, 104 | "value": "Smith" 105 | }, 106 | { 107 | "key": 4, 108 | "value": "36646" 109 | }, 110 | { 111 | "key": 5, 112 | "value": "M" 113 | }, 114 | { 115 | "key": 6, 116 | "value": "2\u0000\u0000\u0000" 117 | } 118 | ], 119 | "upper_bounds": [ 120 | { 121 | "key": 1, 122 | "value": "New James" 123 | }, 124 | { 125 | "key": 2, 126 | "value": "" 127 | }, 128 | { 129 | "key": 3, 130 | "value": "Smith" 131 | }, 132 | { 133 | "key": 4, 134 | "value": "36646" 135 | }, 136 | { 137 | "key": 5, 138 | "value": "M" 139 | }, 140 | { 141 | "key": 6, 142 | "value": "2\u0000\u0000\u0000" 143 | } 144 | ], 145 | "key_metadata": null, 146 | "split_offsets": [ 147 | 4 148 | ], 149 | "equality_ids": null, 150 | "sort_order_id": 0 151 | } 152 | } 153 | ] -------------------------------------------------------------------------------- /medium-data/transformed_avro_files/manifests/c5934717-68b5-4262-8218-a2de395ba51e-m1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "status": 1, 4 | "snapshot_id": 739442481904053118, 5 | "sequence_number": null, 6 | "file_sequence_number": null, 7 | "data_file": { 8 | "content": 0, 9 | "file_path": "s3://iceberg-data/db/minio/data/00000-19-b5ae6ac4-9f6e-4b63-a510-a185ecd03fee-00001.parquet", 10 | "file_format": "PARQUET", 11 | "partition": {}, 12 | "record_count": 1, 13 | "file_size_in_bytes": 1636, 14 | "column_sizes": [ 15 | { 16 | "key": 1, 17 | "value": 51 18 | }, 19 | { 20 | "key": 2, 21 | "value": 42 22 | }, 23 | { 24 | "key": 3, 25 | "value": 47 26 | }, 27 | { 28 | "key": 4, 29 | "value": 47 30 | }, 31 | { 32 | "key": 5, 33 | "value": 43 34 | }, 35 | { 36 | "key": 6, 37 | "value": 42 38 | } 39 | ], 40 | "value_counts": [ 41 | { 42 | "key": 1, 43 | "value": 1 44 | }, 45 | { 46 | "key": 2, 47 | "value": 1 48 | }, 49 | { 50 | "key": 3, 51 | "value": 1 52 | }, 53 | { 54 | "key": 4, 55 | "value": 1 56 | }, 57 | { 58 | "key": 5, 
59 | "value": 1 60 | }, 61 | { 62 | "key": 6, 63 | "value": 1 64 | } 65 | ], 66 | "null_value_counts": [ 67 | { 68 | "key": 1, 69 | "value": 0 70 | }, 71 | { 72 | "key": 2, 73 | "value": 0 74 | }, 75 | { 76 | "key": 3, 77 | "value": 0 78 | }, 79 | { 80 | "key": 4, 81 | "value": 0 82 | }, 83 | { 84 | "key": 5, 85 | "value": 0 86 | }, 87 | { 88 | "key": 6, 89 | "value": 0 90 | } 91 | ], 92 | "nan_value_counts": [], 93 | "lower_bounds": [ 94 | { 95 | "key": 1, 96 | "value": "New James" 97 | }, 98 | { 99 | "key": 2, 100 | "value": "" 101 | }, 102 | { 103 | "key": 3, 104 | "value": "Smith" 105 | }, 106 | { 107 | "key": 4, 108 | "value": "36646" 109 | }, 110 | { 111 | "key": 5, 112 | "value": "M" 113 | }, 114 | { 115 | "key": 6, 116 | "value": "\u00dc\u0000\u0000\u0000" 117 | } 118 | ], 119 | "upper_bounds": [ 120 | { 121 | "key": 1, 122 | "value": "New James" 123 | }, 124 | { 125 | "key": 2, 126 | "value": "" 127 | }, 128 | { 129 | "key": 3, 130 | "value": "Smith" 131 | }, 132 | { 133 | "key": 4, 134 | "value": "36646" 135 | }, 136 | { 137 | "key": 5, 138 | "value": "M" 139 | }, 140 | { 141 | "key": 6, 142 | "value": "\u00dc\u0000\u0000\u0000" 143 | } 144 | ], 145 | "key_metadata": null, 146 | "split_offsets": [ 147 | 4 148 | ], 149 | "equality_ids": null, 150 | "sort_order_id": 0 151 | } 152 | } 153 | ] -------------------------------------------------------------------------------- /medium-data/transformed_avro_files/manifests/d4223bc8-a7bd-46b3-80f3-7434fb900e74-m1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "status": 1, 4 | "snapshot_id": 7825605915503001692, 5 | "sequence_number": null, 6 | "file_sequence_number": null, 7 | "data_file": { 8 | "content": 0, 9 | "file_path": "s3://iceberg-data/db/minio/data/00000-14-ab2a6c1e-23c1-4aa1-8d9a-09a9d7d97238-00001.parquet", 10 | "file_format": "PARQUET", 11 | "partition": {}, 12 | "record_count": 2, 13 | "file_size_in_bytes": 1656, 14 | "column_sizes": [ 15 | { 16 
| "key": 1, 17 | "value": 56 18 | }, 19 | { 20 | "key": 2, 21 | "value": 50 22 | }, 23 | { 24 | "key": 3, 25 | "value": 59 26 | }, 27 | { 28 | "key": 4, 29 | "value": 56 30 | }, 31 | { 32 | "key": 5, 33 | "value": 48 34 | }, 35 | { 36 | "key": 6, 37 | "value": 72 38 | } 39 | ], 40 | "value_counts": [ 41 | { 42 | "key": 1, 43 | "value": 2 44 | }, 45 | { 46 | "key": 2, 47 | "value": 2 48 | }, 49 | { 50 | "key": 3, 51 | "value": 2 52 | }, 53 | { 54 | "key": 4, 55 | "value": 2 56 | }, 57 | { 58 | "key": 5, 59 | "value": 2 60 | }, 61 | { 62 | "key": 6, 63 | "value": 2 64 | } 65 | ], 66 | "null_value_counts": [ 67 | { 68 | "key": 1, 69 | "value": 0 70 | }, 71 | { 72 | "key": 2, 73 | "value": 0 74 | }, 75 | { 76 | "key": 3, 77 | "value": 0 78 | }, 79 | { 80 | "key": 4, 81 | "value": 0 82 | }, 83 | { 84 | "key": 5, 85 | "value": 0 86 | }, 87 | { 88 | "key": 6, 89 | "value": 0 90 | } 91 | ], 92 | "nan_value_counts": [], 93 | "lower_bounds": [ 94 | { 95 | "key": 1, 96 | "value": "Maria" 97 | }, 98 | { 99 | "key": 2, 100 | "value": "" 101 | }, 102 | { 103 | "key": 3, 104 | "value": "Jones" 105 | }, 106 | { 107 | "key": 4, 108 | "value": "39192" 109 | }, 110 | { 111 | "key": 5, 112 | "value": "F" 113 | }, 114 | { 115 | "key": 6, 116 | "value": "\u00a0\u000f\u0000\u0000" 117 | } 118 | ], 119 | "upper_bounds": [ 120 | { 121 | "key": 1, 122 | "value": "Robert" 123 | }, 124 | { 125 | "key": 2, 126 | "value": "Anne" 127 | }, 128 | { 129 | "key": 3, 130 | "value": "Williams" 131 | }, 132 | { 133 | "key": 4, 134 | "value": "42114" 135 | }, 136 | { 137 | "key": 5, 138 | "value": "M" 139 | }, 140 | { 141 | "key": 6, 142 | "value": "\u00a0\u000f\u0000\u0000" 143 | } 144 | ], 145 | "key_metadata": null, 146 | "split_offsets": [ 147 | 4 148 | ], 149 | "equality_ids": null, 150 | "sort_order_id": 0 151 | } 152 | } 153 | ] -------------------------------------------------------------------------------- /notebooks/add-modify-remove-data.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "694fd852-8c95-411f-864c-919941ba9d22", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark.sql import SparkSession\n", 13 | "from pyspark.sql.types import StructType,StructField, StringType, IntegerType" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "30542263-049f-49aa-9ca1-c821d5cf818e", 20 | "metadata": { 21 | "tags": [] 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 26 | "\n", 27 | "spark" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "ebce8acc-912a-4e17-a0af-805dc7117570", 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 40 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 41 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 42 | " (\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 43 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 44 | " ]\n", 45 | "\n", 46 | "schema = StructType([ \\\n", 47 | " StructField(\"firstname\",StringType(),True), \\\n", 48 | " StructField(\"middlename\",StringType(),True), \\\n", 49 | " StructField(\"lastname\",StringType(),True), \\\n", 50 | " StructField(\"id\", StringType(), True), \\\n", 51 | " StructField(\"gender\", StringType(), True), \\\n", 52 | " StructField(\"salary\", IntegerType(), True) \\\n", 53 | " ])" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "6d06e12f-ec86-4935-a065-68901e98c8b9", 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": 
"555b9977-8792-4476-8110-25f62bd981ff", 72 | "metadata": { 73 | "tags": [] 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "df = spark.createDataFrame(data=data, schema=schema)\n", 78 | "df.printSchema()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "56bce0e3-bbc1-4531-ad29-6d7a36eea73c", 85 | "metadata": { 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "df.writeTo(\"db.minio\").createOrReplace()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "b5d3c232-1ab7-46a8-9632-533ba7186510", 97 | "metadata": { 98 | "tags": [] 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "new_data = [(\"New James\",\"\",\"Smith\",\"36646\", \"M\", 50)]\n", 103 | "df = spark.createDataFrame(data=new_data, schema=schema)\n", 104 | "df.writeTo(\"db.minio\").append()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "6c07b0d7a33625fc", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "res = spark.sql(\"SELECT * FROM db.minio\")\n", 115 | "res.show()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "10bebd93fa7fc888", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "spark.sql(\"DELETE FROM db.minio WHERE salary < 0\")\n", 126 | "table = spark.table(\"db.minio\")\n", 127 | "table.show()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "a8e98cb72bb2e467", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "spark.sql(\"UPDATE db.minio set salary = 220 WHERE id == '36646'\")\n", 138 | "table = spark.table(\"db.minio\")\n", 139 | "table.show()" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "bb2e01b8-8761-4eab-a3a9-ecf5d03eae3f", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "spark.stop()" 150 | ] 151 | }, 152 | { 153 | 
"cell_type": "code", 154 | "execution_count": null, 155 | "id": "0d388fe5-1f3c-46cd-8ff9-e9f933d5e304", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3 (ipykernel)", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.11.7" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 5 182 | } 183 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/add-modify-remove-data-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "694fd852-8c95-411f-864c-919941ba9d22", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark.sql import SparkSession\n", 13 | "from pyspark.sql.types import StructType,StructField, StringType, IntegerType" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "30542263-049f-49aa-9ca1-c821d5cf818e", 20 | "metadata": { 21 | "tags": [] 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 26 | "\n", 27 | "spark" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "ebce8acc-912a-4e17-a0af-805dc7117570", 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 40 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 41 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 42 | " 
(\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 43 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 44 | " ]\n", 45 | "\n", 46 | "schema = StructType([ \\\n", 47 | " StructField(\"firstname\",StringType(),True), \\\n", 48 | " StructField(\"middlename\",StringType(),True), \\\n", 49 | " StructField(\"lastname\",StringType(),True), \\\n", 50 | " StructField(\"id\", StringType(), True), \\\n", 51 | " StructField(\"gender\", StringType(), True), \\\n", 52 | " StructField(\"salary\", IntegerType(), True) \\\n", 53 | " ])" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "6d06e12f-ec86-4935-a065-68901e98c8b9", 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "555b9977-8792-4476-8110-25f62bd981ff", 72 | "metadata": { 73 | "tags": [] 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "df = spark.createDataFrame(data=data, schema=schema)\n", 78 | "df.printSchema()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "56bce0e3-bbc1-4531-ad29-6d7a36eea73c", 85 | "metadata": { 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "df.writeTo(\"db.minio\").createOrReplace()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "b5d3c232-1ab7-46a8-9632-533ba7186510", 97 | "metadata": { 98 | "tags": [] 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "new_data = [(\"New James\",\"\",\"Smith\",\"36646\", \"M\", 50)]\n", 103 | "df = spark.createDataFrame(data=new_data, schema=schema)\n", 104 | "df.writeTo(\"db.minio\").append()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "6c07b0d7a33625fc", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "res = spark.sql(\"SELECT * FROM db.minio\")\n", 115 | "res.show()" 116 | ] 117 | }, 118 | { 119 | 
"cell_type": "code", 120 | "execution_count": null, 121 | "id": "10bebd93fa7fc888", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "spark.sql(\"DELETE FROM db.minio WHERE salary < 0\")\n", 126 | "table = spark.table(\"db.minio\")\n", 127 | "table.show()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "a8e98cb72bb2e467", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "spark.sql(\"UPDATE db.minio set salary = 220 WHERE id == '36646'\")\n", 138 | "table = spark.table(\"db.minio\")\n", 139 | "table.show()" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "bb2e01b8-8761-4eab-a3a9-ecf5d03eae3f", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "spark.stop()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "0d388fe5-1f3c-46cd-8ff9-e9f933d5e304", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3 (ipykernel)", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.11.7" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 5 182 | } 183 | -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/00002-9911da1b-4e8c-4216-96f8-df53466e5096.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "format-version" : 2, 3 | "table-uuid" : "0016734d-5067-4b3c-be6f-4ca824f900fc", 4 | "location" : "s3://iceberg-data/db/minio", 5 | "last-sequence-number" : 3, 6 | "last-updated-ms" : 
1706520063793, 7 | "last-column-id" : 6, 8 | "current-schema-id" : 0, 9 | "schemas" : [ { 10 | "type" : "struct", 11 | "schema-id" : 0, 12 | "fields" : [ { 13 | "id" : 1, 14 | "name" : "firstname", 15 | "required" : false, 16 | "type" : "string" 17 | }, { 18 | "id" : 2, 19 | "name" : "middlename", 20 | "required" : false, 21 | "type" : "string" 22 | }, { 23 | "id" : 3, 24 | "name" : "lastname", 25 | "required" : false, 26 | "type" : "string" 27 | }, { 28 | "id" : 4, 29 | "name" : "id", 30 | "required" : false, 31 | "type" : "string" 32 | }, { 33 | "id" : 5, 34 | "name" : "gender", 35 | "required" : false, 36 | "type" : "string" 37 | }, { 38 | "id" : 6, 39 | "name" : "salary", 40 | "required" : false, 41 | "type" : "int" 42 | } ] 43 | } ], 44 | "default-spec-id" : 0, 45 | "partition-specs" : [ { 46 | "spec-id" : 0, 47 | "fields" : [ ] 48 | } ], 49 | "last-partition-id" : 999, 50 | "default-sort-order-id" : 0, 51 | "sort-orders" : [ { 52 | "order-id" : 0, 53 | "fields" : [ ] 54 | } ], 55 | "properties" : { 56 | "owner" : "root", 57 | "write.parquet.compression-codec" : "zstd" 58 | }, 59 | "current-snapshot-id" : 7825605915503001692, 60 | "refs" : { 61 | "main" : { 62 | "snapshot-id" : 7825605915503001692, 63 | "type" : "branch" 64 | } 65 | }, 66 | "snapshots" : [ { 67 | "sequence-number" : 1, 68 | "snapshot-id" : 7426647800932772370, 69 | "timestamp-ms" : 1706516546185, 70 | "summary" : { 71 | "operation" : "append", 72 | "spark.app.id" : "app-20240129082210-0000", 73 | "added-data-files" : "2", 74 | "added-records" : "5", 75 | "added-files-size" : "3365", 76 | "changed-partition-count" : "1", 77 | "total-records" : "5", 78 | "total-files-size" : "3365", 79 | "total-data-files" : "2", 80 | "total-delete-files" : "0", 81 | "total-position-deletes" : "0", 82 | "total-equality-deletes" : "0" 83 | }, 84 | "manifest-list" : "s3://iceberg-data/db/minio/metadata/snap-7426647800932772370-1-b10658da-8308-41c7-9209-217725fa8660.avro", 85 | "schema-id" : 0 86 | }, { 87 | 
"sequence-number" : 2, 88 | "snapshot-id" : 2032559078621466157, 89 | "parent-snapshot-id" : 7426647800932772370, 90 | "timestamp-ms" : 1706518166494, 91 | "summary" : { 92 | "operation" : "append", 93 | "spark.app.id" : "app-20240129082210-0000", 94 | "added-data-files" : "1", 95 | "added-records" : "1", 96 | "added-files-size" : "1636", 97 | "changed-partition-count" : "1", 98 | "total-records" : "6", 99 | "total-files-size" : "5001", 100 | "total-data-files" : "3", 101 | "total-delete-files" : "0", 102 | "total-position-deletes" : "0", 103 | "total-equality-deletes" : "0" 104 | }, 105 | "manifest-list" : "s3://iceberg-data/db/minio/metadata/snap-2032559078621466157-1-274b9f78-7835-494a-88e7-c4a7fbc87659.avro", 106 | "schema-id" : 0 107 | }, { 108 | "sequence-number" : 3, 109 | "snapshot-id" : 7825605915503001692, 110 | "parent-snapshot-id" : 2032559078621466157, 111 | "timestamp-ms" : 1706520063793, 112 | "summary" : { 113 | "operation" : "overwrite", 114 | "spark.app.id" : "app-20240129082210-0000", 115 | "added-data-files" : "1", 116 | "deleted-data-files" : "1", 117 | "added-records" : "2", 118 | "deleted-records" : "3", 119 | "added-files-size" : "1656", 120 | "removed-files-size" : "1724", 121 | "changed-partition-count" : "1", 122 | "total-records" : "5", 123 | "total-files-size" : "4933", 124 | "total-data-files" : "3", 125 | "total-delete-files" : "0", 126 | "total-position-deletes" : "0", 127 | "total-equality-deletes" : "0" 128 | }, 129 | "manifest-list" : "s3://iceberg-data/db/minio/metadata/snap-7825605915503001692-1-d4223bc8-a7bd-46b3-80f3-7434fb900e74.avro", 130 | "schema-id" : 0 131 | } ], 132 | "statistics" : [ ], 133 | "snapshot-log" : [ { 134 | "timestamp-ms" : 1706516546185, 135 | "snapshot-id" : 7426647800932772370 136 | }, { 137 | "timestamp-ms" : 1706518166494, 138 | "snapshot-id" : 2032559078621466157 139 | }, { 140 | "timestamp-ms" : 1706520063793, 141 | "snapshot-id" : 7825605915503001692 142 | } ], 143 | "metadata-log" : [ { 144 | 
"timestamp-ms" : 1706516546185, 145 | "metadata-file" : "s3://iceberg-data/db/minio/metadata/00000-f5b4c31a-cf6e-4723-a086-0c66c6a102f4.metadata.json" 146 | }, { 147 | "timestamp-ms" : 1706518166494, 148 | "metadata-file" : "s3://iceberg-data/db/minio/metadata/00001-4c5adbca-0447-491b-999c-50c28051f24d.metadata.json" 149 | } ] 150 | } -------------------------------------------------------------------------------- /medium-data/db/minio/metadata/00003-019ca905-74a2-40a8-a8f3-e123372820fa.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "format-version" : 2, 3 | "table-uuid" : "0016734d-5067-4b3c-be6f-4ca824f900fc", 4 | "location" : "s3://iceberg-data/db/minio", 5 | "last-sequence-number" : 4, 6 | "last-updated-ms" : 1706522607137, 7 | "last-column-id" : 6, 8 | "current-schema-id" : 0, 9 | "schemas" : [ { 10 | "type" : "struct", 11 | "schema-id" : 0, 12 | "fields" : [ { 13 | "id" : 1, 14 | "name" : "firstname", 15 | "required" : false, 16 | "type" : "string" 17 | }, { 18 | "id" : 2, 19 | "name" : "middlename", 20 | "required" : false, 21 | "type" : "string" 22 | }, { 23 | "id" : 3, 24 | "name" : "lastname", 25 | "required" : false, 26 | "type" : "string" 27 | }, { 28 | "id" : 4, 29 | "name" : "id", 30 | "required" : false, 31 | "type" : "string" 32 | }, { 33 | "id" : 5, 34 | "name" : "gender", 35 | "required" : false, 36 | "type" : "string" 37 | }, { 38 | "id" : 6, 39 | "name" : "salary", 40 | "required" : false, 41 | "type" : "int" 42 | } ] 43 | } ], 44 | "default-spec-id" : 0, 45 | "partition-specs" : [ { 46 | "spec-id" : 0, 47 | "fields" : [ ] 48 | } ], 49 | "last-partition-id" : 999, 50 | "default-sort-order-id" : 0, 51 | "sort-orders" : [ { 52 | "order-id" : 0, 53 | "fields" : [ ] 54 | } ], 55 | "properties" : { 56 | "owner" : "root", 57 | "write.parquet.compression-codec" : "zstd" 58 | }, 59 | "current-snapshot-id" : 739442481904053118, 60 | "refs" : { 61 | "main" : { 62 | "snapshot-id" : 
739442481904053118, 63 | "type" : "branch" 64 | } 65 | }, 66 | "snapshots" : [ { 67 | "sequence-number" : 1, 68 | "snapshot-id" : 7426647800932772370, 69 | "timestamp-ms" : 1706516546185, 70 | "summary" : { 71 | "operation" : "append", 72 | "spark.app.id" : "app-20240129082210-0000", 73 | "added-data-files" : "2", 74 | "added-records" : "5", 75 | "added-files-size" : "3365", 76 | "changed-partition-count" : "1", 77 | "total-records" : "5", 78 | "total-files-size" : "3365", 79 | "total-data-files" : "2", 80 | "total-delete-files" : "0", 81 | "total-position-deletes" : "0", 82 | "total-equality-deletes" : "0" 83 | }, 84 | "manifest-list" : "s3://iceberg-data/db/minio/metadata/snap-7426647800932772370-1-b10658da-8308-41c7-9209-217725fa8660.avro", 85 | "schema-id" : 0 86 | }, { 87 | "sequence-number" : 2, 88 | "snapshot-id" : 2032559078621466157, 89 | "parent-snapshot-id" : 7426647800932772370, 90 | "timestamp-ms" : 1706518166494, 91 | "summary" : { 92 | "operation" : "append", 93 | "spark.app.id" : "app-20240129082210-0000", 94 | "added-data-files" : "1", 95 | "added-records" : "1", 96 | "added-files-size" : "1636", 97 | "changed-partition-count" : "1", 98 | "total-records" : "6", 99 | "total-files-size" : "5001", 100 | "total-data-files" : "3", 101 | "total-delete-files" : "0", 102 | "total-position-deletes" : "0", 103 | "total-equality-deletes" : "0" 104 | }, 105 | "manifest-list" : "s3://iceberg-data/db/minio/metadata/snap-2032559078621466157-1-274b9f78-7835-494a-88e7-c4a7fbc87659.avro", 106 | "schema-id" : 0 107 | }, { 108 | "sequence-number" : 3, 109 | "snapshot-id" : 7825605915503001692, 110 | "parent-snapshot-id" : 2032559078621466157, 111 | "timestamp-ms" : 1706520063793, 112 | "summary" : { 113 | "operation" : "overwrite", 114 | "spark.app.id" : "app-20240129082210-0000", 115 | "added-data-files" : "1", 116 | "deleted-data-files" : "1", 117 | "added-records" : "2", 118 | "deleted-records" : "3", 119 | "added-files-size" : "1656", 120 | "removed-files-size" : 
"1724", 121 | "changed-partition-count" : "1", 122 | "total-records" : "5", 123 | "total-files-size" : "4933", 124 | "total-data-files" : "3", 125 | "total-delete-files" : "0", 126 | "total-position-deletes" : "0", 127 | "total-equality-deletes" : "0" 128 | }, 129 | "manifest-list" : "s3://iceberg-data/db/minio/metadata/snap-7825605915503001692-1-d4223bc8-a7bd-46b3-80f3-7434fb900e74.avro", 130 | "schema-id" : 0 131 | }, { 132 | "sequence-number" : 4, 133 | "snapshot-id" : 739442481904053118, 134 | "parent-snapshot-id" : 7825605915503001692, 135 | "timestamp-ms" : 1706522607137, 136 | "summary" : { 137 | "operation" : "overwrite", 138 | "spark.app.id" : "app-20240129082210-0000", 139 | "added-data-files" : "1", 140 | "deleted-data-files" : "1", 141 | "added-records" : "1", 142 | "deleted-records" : "1", 143 | "added-files-size" : "1636", 144 | "removed-files-size" : "1636", 145 | "changed-partition-count" : "1", 146 | "total-records" : "5", 147 | "total-files-size" : "4933", 148 | "total-data-files" : "3", 149 | "total-delete-files" : "0", 150 | "total-position-deletes" : "0", 151 | "total-equality-deletes" : "0" 152 | }, 153 | "manifest-list" : "s3://iceberg-data/db/minio/metadata/snap-739442481904053118-1-c5934717-68b5-4262-8218-a2de395ba51e.avro", 154 | "schema-id" : 0 155 | } ], 156 | "statistics" : [ ], 157 | "snapshot-log" : [ { 158 | "timestamp-ms" : 1706516546185, 159 | "snapshot-id" : 7426647800932772370 160 | }, { 161 | "timestamp-ms" : 1706518166494, 162 | "snapshot-id" : 2032559078621466157 163 | }, { 164 | "timestamp-ms" : 1706520063793, 165 | "snapshot-id" : 7825605915503001692 166 | }, { 167 | "timestamp-ms" : 1706522607137, 168 | "snapshot-id" : 739442481904053118 169 | } ], 170 | "metadata-log" : [ { 171 | "timestamp-ms" : 1706516546185, 172 | "metadata-file" : "s3://iceberg-data/db/minio/metadata/00000-f5b4c31a-cf6e-4723-a086-0c66c6a102f4.metadata.json" 173 | }, { 174 | "timestamp-ms" : 1706518166494, 175 | "metadata-file" : 
"s3://iceberg-data/db/minio/metadata/00001-4c5adbca-0447-491b-999c-50c28051f24d.metadata.json" 176 | }, { 177 | "timestamp-ms" : 1706520063793, 178 | "metadata-file" : "s3://iceberg-data/db/minio/metadata/00002-9911da1b-4e8c-4216-96f8-df53466e5096.metadata.json" 179 | } ] 180 | } -------------------------------------------------------------------------------- /spark/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | # SHA1:b2683bb70b5f87b364d74ec190cf2600ceb9a3a4 2 | # 3 | # This file is autogenerated by pip-compile-multi 4 | # To update, run: 5 | # 6 | # pip-compile-multi 7 | # 8 | annotated-types==0.6.0 9 | # via pydantic 10 | anyio==4.0.0 11 | # via jupyter-server 12 | appnope==0.1.3 13 | # via 14 | # ipykernel 15 | # ipython 16 | argon2-cffi==23.1.0 17 | # via jupyter-server 18 | argon2-cffi-bindings==21.2.0 19 | # via argon2-cffi 20 | arrow==1.3.0 21 | # via isoduration 22 | asttokens==2.4.1 23 | # via stack-data 24 | async-lru==2.0.4 25 | # via jupyterlab 26 | attrs==23.1.0 27 | # via 28 | # jsonschema 29 | # referencing 30 | babel==2.13.1 31 | # via jupyterlab-server 32 | beautifulsoup4==4.12.2 33 | # via nbconvert 34 | bleach==6.1.0 35 | # via nbconvert 36 | certifi==2023.7.22 37 | # via requests 38 | cffi==1.16.0 39 | # via argon2-cffi-bindings 40 | charset-normalizer==3.3.2 41 | # via requests 42 | click==8.1.7 43 | # via pyiceberg 44 | comm==0.2.0 45 | # via 46 | # ipykernel 47 | # ipywidgets 48 | debugpy==1.8.0 49 | # via ipykernel 50 | decorator==5.1.1 51 | # via ipython 52 | defusedxml==0.7.1 53 | # via nbconvert 54 | duckdb==0.9.1 55 | # via pyiceberg 56 | executing==2.0.1 57 | # via stack-data 58 | fastjsonschema==2.18.1 59 | # via nbformat 60 | fqdn==1.5.1 61 | # via jsonschema 62 | fsspec==2023.10.0 63 | # via pyiceberg 64 | idna==3.4 65 | # via 66 | # anyio 67 | # jsonschema 68 | # requests 69 | ipykernel==6.26.0 70 | # via 71 | # jupyter 72 | # jupyter-console 73 | # jupyterlab 74 
| # qtconsole 75 | ipython==8.17.2 76 | # via 77 | # ipykernel 78 | # ipywidgets 79 | # jupyter-console 80 | ipywidgets==8.1.1 81 | # via jupyter 82 | isoduration==20.11.0 83 | # via jsonschema 84 | jedi==0.19.1 85 | # via ipython 86 | jinja2==3.1.2 87 | # via 88 | # jupyter-server 89 | # jupyterlab 90 | # jupyterlab-server 91 | # nbconvert 92 | json5==0.9.14 93 | # via jupyterlab-server 94 | jsonpointer==2.4 95 | # via jsonschema 96 | jsonschema[format-nongpl]==4.19.2 97 | # via 98 | # jupyter-events 99 | # jupyterlab-server 100 | # nbformat 101 | jsonschema-specifications==2023.7.1 102 | # via jsonschema 103 | jupyter==1.0.0 104 | # via -r spark/requirements/requirements.in 105 | jupyter-client==8.6.0 106 | # via 107 | # ipykernel 108 | # jupyter-console 109 | # jupyter-server 110 | # nbclient 111 | # qtconsole 112 | jupyter-console==6.6.3 113 | # via jupyter 114 | jupyter-core==5.5.0 115 | # via 116 | # ipykernel 117 | # jupyter-client 118 | # jupyter-console 119 | # jupyter-server 120 | # jupyterlab 121 | # nbclient 122 | # nbconvert 123 | # nbformat 124 | # qtconsole 125 | jupyter-events==0.9.0 126 | # via jupyter-server 127 | jupyter-lsp==2.2.0 128 | # via jupyterlab 129 | jupyter-server==2.10.0 130 | # via 131 | # jupyter-lsp 132 | # jupyterlab 133 | # jupyterlab-server 134 | # notebook 135 | # notebook-shim 136 | jupyter-server-terminals==0.4.4 137 | # via jupyter-server 138 | jupyterlab==4.0.8 139 | # via notebook 140 | jupyterlab-pygments==0.2.2 141 | # via nbconvert 142 | jupyterlab-server==2.25.1 143 | # via 144 | # jupyterlab 145 | # notebook 146 | jupyterlab-widgets==3.0.9 147 | # via ipywidgets 148 | markdown-it-py==3.0.0 149 | # via rich 150 | markupsafe==2.1.3 151 | # via 152 | # jinja2 153 | # nbconvert 154 | matplotlib-inline==0.1.6 155 | # via 156 | # ipykernel 157 | # ipython 158 | mdurl==0.1.2 159 | # via markdown-it-py 160 | mistune==3.0.2 161 | # via nbconvert 162 | mmhash3==3.0.1 163 | # via pyiceberg 164 | nbclient==0.9.0 165 | # via 
nbconvert 166 | nbconvert==7.11.0 167 | # via 168 | # jupyter 169 | # jupyter-server 170 | nbformat==5.9.2 171 | # via 172 | # jupyter-server 173 | # nbclient 174 | # nbconvert 175 | nest-asyncio==1.5.8 176 | # via ipykernel 177 | notebook==7.0.6 178 | # via jupyter 179 | notebook-shim==0.2.3 180 | # via 181 | # jupyterlab 182 | # notebook 183 | numpy==1.26.2 184 | # via 185 | # pandas 186 | # pyarrow 187 | overrides==7.4.0 188 | # via jupyter-server 189 | packaging==23.2 190 | # via 191 | # ipykernel 192 | # jupyter-server 193 | # jupyterlab 194 | # jupyterlab-server 195 | # nbconvert 196 | # qtconsole 197 | # qtpy 198 | pandas==2.1.3 199 | # via pyiceberg 200 | pandocfilters==1.5.0 201 | # via nbconvert 202 | parso==0.8.3 203 | # via jedi 204 | pexpect==4.8.0 205 | # via ipython 206 | platformdirs==4.0.0 207 | # via jupyter-core 208 | prometheus-client==0.18.0 209 | # via jupyter-server 210 | prompt-toolkit==3.0.40 211 | # via 212 | # ipython 213 | # jupyter-console 214 | psutil==5.9.6 215 | # via ipykernel 216 | ptyprocess==0.7.0 217 | # via 218 | # pexpect 219 | # terminado 220 | pure-eval==0.2.2 221 | # via stack-data 222 | pyarrow==13.0.0 223 | # via pyiceberg 224 | pycparser==2.21 225 | # via cffi 226 | pydantic==2.5.0 227 | # via pyiceberg 228 | pydantic-core==2.14.1 229 | # via pydantic 230 | pygments==2.16.1 231 | # via 232 | # ipython 233 | # jupyter-console 234 | # nbconvert 235 | # qtconsole 236 | # rich 237 | pyiceberg[duckdb,pandas,pyarrow]==0.5.1 238 | # via -r spark/requirements/requirements.in 239 | pyparsing==3.1.1 240 | # via pyiceberg 241 | python-dateutil==2.8.2 242 | # via 243 | # arrow 244 | # jupyter-client 245 | # pandas 246 | # strictyaml 247 | python-json-logger==2.0.7 248 | # via jupyter-events 249 | pytz==2023.3.post1 250 | # via pandas 251 | pyyaml==6.0.1 252 | # via jupyter-events 253 | pyzmq==25.1.1 254 | # via 255 | # ipykernel 256 | # jupyter-client 257 | # jupyter-console 258 | # jupyter-server 259 | # qtconsole 260 | 
qtconsole==5.5.0 261 | # via jupyter 262 | qtpy==2.4.1 263 | # via qtconsole 264 | referencing==0.30.2 265 | # via 266 | # jsonschema 267 | # jsonschema-specifications 268 | # jupyter-events 269 | requests==2.31.0 270 | # via 271 | # jupyterlab-server 272 | # pyiceberg 273 | rfc3339-validator==0.1.4 274 | # via 275 | # jsonschema 276 | # jupyter-events 277 | rfc3986-validator==0.1.1 278 | # via 279 | # jsonschema 280 | # jupyter-events 281 | rich==13.6.0 282 | # via pyiceberg 283 | rpds-py==0.12.0 284 | # via 285 | # jsonschema 286 | # referencing 287 | send2trash==1.8.2 288 | # via jupyter-server 289 | six==1.16.0 290 | # via 291 | # asttokens 292 | # bleach 293 | # python-dateutil 294 | # rfc3339-validator 295 | sniffio==1.3.0 296 | # via anyio 297 | sortedcontainers==2.4.0 298 | # via pyiceberg 299 | soupsieve==2.5 300 | # via beautifulsoup4 301 | stack-data==0.6.3 302 | # via ipython 303 | strictyaml==1.7.3 304 | # via pyiceberg 305 | terminado==0.18.0 306 | # via 307 | # jupyter-server 308 | # jupyter-server-terminals 309 | tinycss2==1.2.1 310 | # via nbconvert 311 | tornado==6.3.3 312 | # via 313 | # ipykernel 314 | # jupyter-client 315 | # jupyter-server 316 | # jupyterlab 317 | # notebook 318 | # terminado 319 | traitlets==5.13.0 320 | # via 321 | # comm 322 | # ipykernel 323 | # ipython 324 | # ipywidgets 325 | # jupyter-client 326 | # jupyter-console 327 | # jupyter-core 328 | # jupyter-events 329 | # jupyter-server 330 | # jupyterlab 331 | # matplotlib-inline 332 | # nbclient 333 | # nbconvert 334 | # nbformat 335 | # qtconsole 336 | types-python-dateutil==2.8.19.14 337 | # via arrow 338 | typing-extensions==4.8.0 339 | # via 340 | # pydantic 341 | # pydantic-core 342 | tzdata==2023.3 343 | # via pandas 344 | uri-template==1.3.0 345 | # via jsonschema 346 | urllib3==2.1.0 347 | # via requests 348 | wcwidth==0.2.9 349 | # via prompt-toolkit 350 | webcolors==1.13 351 | # via jsonschema 352 | webencodings==0.5.1 353 | # via 354 | # bleach 355 | # tinycss2 
356 | websocket-client==1.6.4 357 | # via jupyter-server 358 | widgetsnbextension==4.0.9 359 | # via ipywidgets 360 | -------------------------------------------------------------------------------- /medium-data/transformed_avro_files/manifests/d4223bc8-a7bd-46b3-80f3-7434fb900e74-m0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "status": 0, 4 | "snapshot_id": 7426647800932772370, 5 | "sequence_number": 1, 6 | "file_sequence_number": 1, 7 | "data_file": { 8 | "content": 0, 9 | "file_path": "s3://iceberg-data/db/minio/data/00000-0-9a99aa40-5d08-4d7e-850a-d307d23f5c0f-00001.parquet", 10 | "file_format": "PARQUET", 11 | "partition": {}, 12 | "record_count": 2, 13 | "file_size_in_bytes": 1641, 14 | "column_sizes": [ 15 | { 16 | "key": 1, 17 | "value": 58 18 | }, 19 | { 20 | "key": 2, 21 | "value": 50 22 | }, 23 | { 24 | "key": 3, 25 | "value": 51 26 | }, 27 | { 28 | "key": 4, 29 | "value": 56 30 | }, 31 | { 32 | "key": 5, 33 | "value": 73 34 | }, 35 | { 36 | "key": 6, 37 | "value": 46 38 | } 39 | ], 40 | "value_counts": [ 41 | { 42 | "key": 1, 43 | "value": 2 44 | }, 45 | { 46 | "key": 2, 47 | "value": 2 48 | }, 49 | { 50 | "key": 3, 51 | "value": 2 52 | }, 53 | { 54 | "key": 4, 55 | "value": 2 56 | }, 57 | { 58 | "key": 5, 59 | "value": 2 60 | }, 61 | { 62 | "key": 6, 63 | "value": 2 64 | } 65 | ], 66 | "null_value_counts": [ 67 | { 68 | "key": 1, 69 | "value": 0 70 | }, 71 | { 72 | "key": 2, 73 | "value": 0 74 | }, 75 | { 76 | "key": 3, 77 | "value": 0 78 | }, 79 | { 80 | "key": 4, 81 | "value": 0 82 | }, 83 | { 84 | "key": 5, 85 | "value": 0 86 | }, 87 | { 88 | "key": 6, 89 | "value": 0 90 | } 91 | ], 92 | "nan_value_counts": [], 93 | "lower_bounds": [ 94 | { 95 | "key": 1, 96 | "value": "James" 97 | }, 98 | { 99 | "key": 2, 100 | "value": "" 101 | }, 102 | { 103 | "key": 3, 104 | "value": "" 105 | }, 106 | { 107 | "key": 4, 108 | "value": "36636" 109 | }, 110 | { 111 | "key": 5, 112 | "value": "M" 113 | }, 
114 | { 115 | "key": 6, 116 | "value": "\u00b8\u000b\u0000\u0000" 117 | } 118 | ], 119 | "upper_bounds": [ 120 | { 121 | "key": 1, 122 | "value": "Michael" 123 | }, 124 | { 125 | "key": 2, 126 | "value": "Rose" 127 | }, 128 | { 129 | "key": 3, 130 | "value": "Smith" 131 | }, 132 | { 133 | "key": 4, 134 | "value": "40288" 135 | }, 136 | { 137 | "key": 5, 138 | "value": "M" 139 | }, 140 | { 141 | "key": 6, 142 | "value": "\u00a0\u000f\u0000\u0000" 143 | } 144 | ], 145 | "key_metadata": null, 146 | "split_offsets": [ 147 | 4 148 | ], 149 | "equality_ids": null, 150 | "sort_order_id": 0 151 | } 152 | }, 153 | { 154 | "status": 2, 155 | "snapshot_id": 7825605915503001692, 156 | "sequence_number": 1, 157 | "file_sequence_number": 1, 158 | "data_file": { 159 | "content": 0, 160 | "file_path": "s3://iceberg-data/db/minio/data/00001-1-9a99aa40-5d08-4d7e-850a-d307d23f5c0f-00001.parquet", 161 | "file_format": "PARQUET", 162 | "partition": {}, 163 | "record_count": 3, 164 | "file_size_in_bytes": 1724, 165 | "column_sizes": [ 166 | { 167 | "key": 1, 168 | "value": 64 169 | }, 170 | { 171 | "key": 2, 172 | "value": 58 173 | }, 174 | { 175 | "key": 3, 176 | "value": 67 177 | }, 178 | { 179 | "key": 4, 180 | "value": 60 181 | }, 182 | { 183 | "key": 5, 184 | "value": 79 185 | }, 186 | { 187 | "key": 6, 188 | "value": 77 189 | } 190 | ], 191 | "value_counts": [ 192 | { 193 | "key": 1, 194 | "value": 3 195 | }, 196 | { 197 | "key": 2, 198 | "value": 3 199 | }, 200 | { 201 | "key": 3, 202 | "value": 3 203 | }, 204 | { 205 | "key": 4, 206 | "value": 3 207 | }, 208 | { 209 | "key": 5, 210 | "value": 3 211 | }, 212 | { 213 | "key": 6, 214 | "value": 3 215 | } 216 | ], 217 | "null_value_counts": [ 218 | { 219 | "key": 1, 220 | "value": 0 221 | }, 222 | { 223 | "key": 2, 224 | "value": 0 225 | }, 226 | { 227 | "key": 3, 228 | "value": 0 229 | }, 230 | { 231 | "key": 4, 232 | "value": 0 233 | }, 234 | { 235 | "key": 5, 236 | "value": 0 237 | }, 238 | { 239 | "key": 6, 240 | "value": 0 241 
| } 242 | ], 243 | "nan_value_counts": [], 244 | "lower_bounds": [ 245 | { 246 | "key": 1, 247 | "value": "Jen" 248 | }, 249 | { 250 | "key": 2, 251 | "value": "" 252 | }, 253 | { 254 | "key": 3, 255 | "value": "Brown" 256 | }, 257 | { 258 | "key": 4, 259 | "value": "" 260 | }, 261 | { 262 | "key": 5, 263 | "value": "F" 264 | }, 265 | { 266 | "key": 6, 267 | "value": "\u00ff\u00ff\u00ff\u00ff" 268 | } 269 | ], 270 | "upper_bounds": [ 271 | { 272 | "key": 1, 273 | "value": "Robert" 274 | }, 275 | { 276 | "key": 2, 277 | "value": "Mary" 278 | }, 279 | { 280 | "key": 3, 281 | "value": "Williams" 282 | }, 283 | { 284 | "key": 4, 285 | "value": "42114" 286 | }, 287 | { 288 | "key": 5, 289 | "value": "M" 290 | }, 291 | { 292 | "key": 6, 293 | "value": "\u00a0\u000f\u0000\u0000" 294 | } 295 | ], 296 | "key_metadata": null, 297 | "split_offsets": [ 298 | 4 299 | ], 300 | "equality_ids": null, 301 | "sort_order_id": 0 302 | } 303 | } 304 | ] -------------------------------------------------------------------------------- /medium-data/transformed_avro_files/manifests/b10658da-8308-41c7-9209-217725fa8660-m0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "status": 1, 4 | "snapshot_id": 7426647800932772370, 5 | "sequence_number": null, 6 | "file_sequence_number": null, 7 | "data_file": { 8 | "content": 0, 9 | "file_path": "s3://iceberg-data/db/minio/data/00000-0-9a99aa40-5d08-4d7e-850a-d307d23f5c0f-00001.parquet", 10 | "file_format": "PARQUET", 11 | "partition": {}, 12 | "record_count": 2, 13 | "file_size_in_bytes": 1641, 14 | "column_sizes": [ 15 | { 16 | "key": 1, 17 | "value": 58 18 | }, 19 | { 20 | "key": 2, 21 | "value": 50 22 | }, 23 | { 24 | "key": 3, 25 | "value": 51 26 | }, 27 | { 28 | "key": 4, 29 | "value": 56 30 | }, 31 | { 32 | "key": 5, 33 | "value": 73 34 | }, 35 | { 36 | "key": 6, 37 | "value": 46 38 | } 39 | ], 40 | "value_counts": [ 41 | { 42 | "key": 1, 43 | "value": 2 44 | }, 45 | { 46 | "key": 2, 47 | 
"value": 2 48 | }, 49 | { 50 | "key": 3, 51 | "value": 2 52 | }, 53 | { 54 | "key": 4, 55 | "value": 2 56 | }, 57 | { 58 | "key": 5, 59 | "value": 2 60 | }, 61 | { 62 | "key": 6, 63 | "value": 2 64 | } 65 | ], 66 | "null_value_counts": [ 67 | { 68 | "key": 1, 69 | "value": 0 70 | }, 71 | { 72 | "key": 2, 73 | "value": 0 74 | }, 75 | { 76 | "key": 3, 77 | "value": 0 78 | }, 79 | { 80 | "key": 4, 81 | "value": 0 82 | }, 83 | { 84 | "key": 5, 85 | "value": 0 86 | }, 87 | { 88 | "key": 6, 89 | "value": 0 90 | } 91 | ], 92 | "nan_value_counts": [], 93 | "lower_bounds": [ 94 | { 95 | "key": 1, 96 | "value": "James" 97 | }, 98 | { 99 | "key": 2, 100 | "value": "" 101 | }, 102 | { 103 | "key": 3, 104 | "value": "" 105 | }, 106 | { 107 | "key": 4, 108 | "value": "36636" 109 | }, 110 | { 111 | "key": 5, 112 | "value": "M" 113 | }, 114 | { 115 | "key": 6, 116 | "value": "\u00b8\u000b\u0000\u0000" 117 | } 118 | ], 119 | "upper_bounds": [ 120 | { 121 | "key": 1, 122 | "value": "Michael" 123 | }, 124 | { 125 | "key": 2, 126 | "value": "Rose" 127 | }, 128 | { 129 | "key": 3, 130 | "value": "Smith" 131 | }, 132 | { 133 | "key": 4, 134 | "value": "40288" 135 | }, 136 | { 137 | "key": 5, 138 | "value": "M" 139 | }, 140 | { 141 | "key": 6, 142 | "value": "\u00a0\u000f\u0000\u0000" 143 | } 144 | ], 145 | "key_metadata": null, 146 | "split_offsets": [ 147 | 4 148 | ], 149 | "equality_ids": null, 150 | "sort_order_id": 0 151 | } 152 | }, 153 | { 154 | "status": 1, 155 | "snapshot_id": 7426647800932772370, 156 | "sequence_number": null, 157 | "file_sequence_number": null, 158 | "data_file": { 159 | "content": 0, 160 | "file_path": "s3://iceberg-data/db/minio/data/00001-1-9a99aa40-5d08-4d7e-850a-d307d23f5c0f-00001.parquet", 161 | "file_format": "PARQUET", 162 | "partition": {}, 163 | "record_count": 3, 164 | "file_size_in_bytes": 1724, 165 | "column_sizes": [ 166 | { 167 | "key": 1, 168 | "value": 64 169 | }, 170 | { 171 | "key": 2, 172 | "value": 58 173 | }, 174 | { 175 | "key": 3, 176 | 
"value": 67 177 | }, 178 | { 179 | "key": 4, 180 | "value": 60 181 | }, 182 | { 183 | "key": 5, 184 | "value": 79 185 | }, 186 | { 187 | "key": 6, 188 | "value": 77 189 | } 190 | ], 191 | "value_counts": [ 192 | { 193 | "key": 1, 194 | "value": 3 195 | }, 196 | { 197 | "key": 2, 198 | "value": 3 199 | }, 200 | { 201 | "key": 3, 202 | "value": 3 203 | }, 204 | { 205 | "key": 4, 206 | "value": 3 207 | }, 208 | { 209 | "key": 5, 210 | "value": 3 211 | }, 212 | { 213 | "key": 6, 214 | "value": 3 215 | } 216 | ], 217 | "null_value_counts": [ 218 | { 219 | "key": 1, 220 | "value": 0 221 | }, 222 | { 223 | "key": 2, 224 | "value": 0 225 | }, 226 | { 227 | "key": 3, 228 | "value": 0 229 | }, 230 | { 231 | "key": 4, 232 | "value": 0 233 | }, 234 | { 235 | "key": 5, 236 | "value": 0 237 | }, 238 | { 239 | "key": 6, 240 | "value": 0 241 | } 242 | ], 243 | "nan_value_counts": [], 244 | "lower_bounds": [ 245 | { 246 | "key": 1, 247 | "value": "Jen" 248 | }, 249 | { 250 | "key": 2, 251 | "value": "" 252 | }, 253 | { 254 | "key": 3, 255 | "value": "Brown" 256 | }, 257 | { 258 | "key": 4, 259 | "value": "" 260 | }, 261 | { 262 | "key": 5, 263 | "value": "F" 264 | }, 265 | { 266 | "key": 6, 267 | "value": "\u00ff\u00ff\u00ff\u00ff" 268 | } 269 | ], 270 | "upper_bounds": [ 271 | { 272 | "key": 1, 273 | "value": "Robert" 274 | }, 275 | { 276 | "key": 2, 277 | "value": "Mary" 278 | }, 279 | { 280 | "key": 3, 281 | "value": "Williams" 282 | }, 283 | { 284 | "key": 4, 285 | "value": "42114" 286 | }, 287 | { 288 | "key": 5, 289 | "value": "M" 290 | }, 291 | { 292 | "key": 6, 293 | "value": "\u00a0\u000f\u0000\u0000" 294 | } 295 | ], 296 | "key_metadata": null, 297 | "split_offsets": [ 298 | 4 299 | ], 300 | "equality_ids": null, 301 | "sort_order_id": 0 302 | } 303 | } 304 | ] -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/postgres-metadata-checkpoint.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "694fd852-8c95-411f-864c-919941ba9d22", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark.sql import SparkSession\n", 13 | "from pyspark.sql.types import StructType,StructField, StringType, IntegerType" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "30542263-049f-49aa-9ca1-c821d5cf818e", 20 | "metadata": { 21 | "tags": [] 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 26 | "\n", 27 | "spark.stop()" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "7966e2ac-4acb-4fb4-9943-352fdeaf4b71", 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 40 | "\n", 41 | "spark" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "ebce8acc-912a-4e17-a0af-805dc7117570", 48 | "metadata": { 49 | "tags": [] 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 54 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 55 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 56 | " (\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 57 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 58 | " ]\n", 59 | "\n", 60 | "schema = StructType([ \\\n", 61 | " StructField(\"fn\",StringType(),True), \\\n", 62 | " StructField(\"mid\",StringType(),True), \\\n", 63 | " StructField(\"lastname\",StringType(),True), \\\n", 64 | " StructField(\"id\", StringType(), True), \\\n", 65 | " StructField(\"gender\", StringType(), True), \\\n", 66 | " StructField(\"salary\", IntegerType(), True) \\\n", 67 | " ])" 68 | ] 69 | }, 70 | { 71 | 
"cell_type": "code", 72 | "execution_count": null, 73 | "id": "6d06e12f-ec86-4935-a065-68901e98c8b9", 74 | "metadata": { 75 | "tags": [] 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "data" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "555b9977-8792-4476-8110-25f62bd981ff", 86 | "metadata": { 87 | "tags": [] 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "df = spark.createDataFrame(data=data, schema=schema)\n", 92 | "df.printSchema()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "56bce0e3-bbc1-4531-ad29-6d7a36eea73c", 99 | "metadata": { 100 | "tags": [] 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "df.writeTo(\"db.catalog_test\").createOrReplace()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "b5d3c232-1ab7-46a8-9632-533ba7186510", 111 | "metadata": { 112 | "tags": [] 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "res = spark.sql(\"SELECT * FROM db.catalog_test\")\n", 117 | "res.show()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "1fd4c30e-495a-4a02-8b1b-f843ad94e1fd", 124 | "metadata": { 125 | "tags": [] 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "spark.sql(\"ALTER TABLE db.catalog_test RENAME TO db.pg_catalog;\")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "b299eb57-bd43-4e3f-8afa-9394fe08743b", 136 | "metadata": { 137 | "tags": [] 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "spark.sql(\"ALTER TABLE db.pg_catalog SET TBLPROPERTIES ('comment' = 'Learning Iceberg.')\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "813ad05c-4b61-4852-8bcf-50c2a118febc", 148 | "metadata": { 149 | "tags": [] 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "spark.sql(\"ALTER TABLE db.pg_catalog RENAME COLUMN fn TO firstname\")\n", 154 | 
"spark.sql(\"ALTER TABLE db.pg_catalog RENAME COLUMN mid TO middlename\")" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "a7e1ac02-24bd-47e1-a286-cb66be81729f", 161 | "metadata": { 162 | "tags": [] 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "new_data = [(\"New James\",\"\",\"Smith\",\"36646\", \"M\", 50),\n", 167 | " ]\n", 168 | "\n", 169 | "schema = StructType([ \\\n", 170 | " StructField(\"firstname\",StringType(),True), \\\n", 171 | " StructField(\"middlename\",StringType(),True), \\\n", 172 | " StructField(\"lastname\",StringType(),True), \\\n", 173 | " StructField(\"id\", StringType(), True), \\\n", 174 | " StructField(\"gender\", StringType(), True), \\\n", 175 | " StructField(\"salary\", IntegerType(), True) \\\n", 176 | " ])\n", 177 | "\n", 178 | "df = spark.createDataFrame(data=new_data, schema=schema)\n", 179 | "df.writeTo(\"db.pg_metadata\").append()\n", 180 | "\n", 181 | "res = spark.sql(\"SELECT * FROM db.pg_metadata\")\n", 182 | "res.show()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "f320d9ae-22b0-4966-ad3c-0802100db3cf", 189 | "metadata": { 190 | "tags": [] 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "res = spark.sql(\"SELECT * FROM db.pg_metadata.snapshots\")\n", 195 | "res.show(truncate=False)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "e81da59f-c77f-4d02-a52a-05c0a87f4483", 202 | "metadata": { 203 | "tags": [] 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "res = spark.sql(\"SELECT file_path FROM db.pg_metadata.files\")\n", 208 | "res.show(truncate=False)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "id": "986f3ae0-c4b6-4de5-80ab-44556c76f106", 215 | "metadata": { 216 | "tags": [] 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "res = spark.sql(\"SELECT content, path, added_data_files_count, added_snapshot_id FROM 
db.pg_metadata.manifests\")\n", 221 | "res.show(truncate=False)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "7af64035-cee1-4003-a3bd-e1b6862ddace", 228 | "metadata": { 229 | "tags": [] 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "res = spark.sql(\"SELECT committed_at, manifest_list FROM db.pg_metadata.snapshots\")\n", 234 | "res.show(truncate=False)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "2f0fd290-4e2d-443c-814c-ea65b7e368f4", 241 | "metadata": { 242 | "tags": [] 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "res = spark.sql(\"SELECT * FROM db.pg_metadata.manifests\")\n", 247 | "res.show(truncate=False)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "fb9bb08e-33fa-449b-8b61-83a84cf1e7f1", 254 | "metadata": { 255 | "tags": [] 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "spark.catalog" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "97d02afa-a3a2-4553-8074-3514b3778c49", 266 | "metadata": { 267 | "tags": [] 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "spark.catalog.currentDatabase()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "id": "c0cf36c8-c320-4bfa-9e5b-80843814b03a", 278 | "metadata": { 279 | "tags": [] 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "spark.catalog.listTables()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "a0b91051-7a13-4fb2-9a94-ea83c43b3ee6", 290 | "metadata": { 291 | "tags": [] 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "spark.catalog.listDatabases()" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "id": "749ba2f8-f330-4d08-8d1d-c8f8882ba995", 302 | "metadata": { 303 | "tags": [] 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "spark.sql('show 
databases').show()" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "536b3118-6b54-404a-9561-1d94b49dbede", 314 | "metadata": { 315 | "tags": [] 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "spark.catalog.currentDatabase()" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "id": "39b1de4a-9a3d-4cf7-a4b1-7c374f32b2d3", 326 | "metadata": { 327 | "tags": [] 328 | }, 329 | "outputs": [], 330 | "source": [ 331 | "spark.catalog.listTables('db')" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "id": "f031ddb2-0531-46c1-8b71-0c85b04bfc3b", 338 | "metadata": { 339 | "tags": [] 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "spark.sql('show tables from db').show()" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "90c5bfe5-ad34-4d0d-b06c-1844711b9f44", 350 | "metadata": { 351 | "tags": [] 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "spark.catalog.listCatalog()" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "2736df5e-f5fc-46c9-a99f-8068f78f7c20", 362 | "metadata": { 363 | "tags": [] 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "# spark.stop()" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "id": "e0d6e63f-3392-463b-845b-e96f339a7a14", 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 378 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 379 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 380 | " (\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 381 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 382 | " ]" 383 | ] 384 | } 385 | ], 386 | "metadata": { 387 | "kernelspec": { 388 | "display_name": "Python 3 (ipykernel)", 389 | "language": "python", 390 | "name": "python3" 391 | 
}, 392 | "language_info": { 393 | "codemirror_mode": { 394 | "name": "ipython", 395 | "version": 3 396 | }, 397 | "file_extension": ".py", 398 | "mimetype": "text/x-python", 399 | "name": "python", 400 | "nbconvert_exporter": "python", 401 | "pygments_lexer": "ipython3", 402 | "version": "3.10.11" 403 | } 404 | }, 405 | "nbformat": 4, 406 | "nbformat_minor": 5 407 | } 408 | -------------------------------------------------------------------------------- /notebooks/postgres-catalog.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "694fd852-8c95-411f-864c-919941ba9d22", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark.sql import SparkSession\n", 13 | "from pyspark.sql.types import StructType,StructField, StringType, IntegerType" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "30542263-049f-49aa-9ca1-c821d5cf818e", 20 | "metadata": { 21 | "tags": [] 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 26 | "\n", 27 | "spark.stop()" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "7966e2ac-4acb-4fb4-9943-352fdeaf4b71", 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 40 | "\n", 41 | "spark" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "ebce8acc-912a-4e17-a0af-805dc7117570", 48 | "metadata": { 49 | "tags": [] 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 54 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 55 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 56 | " (\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 
57 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 58 | " ]\n", 59 | "\n", 60 | "schema = StructType([ \\\n", 61 | " StructField(\"fn\",StringType(),True), \\\n", 62 | " StructField(\"mid\",StringType(),True), \\\n", 63 | " StructField(\"lastname\",StringType(),True), \\\n", 64 | " StructField(\"id\", StringType(), True), \\\n", 65 | " StructField(\"gender\", StringType(), True), \\\n", 66 | " StructField(\"salary\", IntegerType(), True) \\\n", 67 | " ])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "6d06e12f-ec86-4935-a065-68901e98c8b9", 74 | "metadata": { 75 | "tags": [] 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "data" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "555b9977-8792-4476-8110-25f62bd981ff", 86 | "metadata": { 87 | "tags": [] 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "df = spark.createDataFrame(data=data, schema=schema)\n", 92 | "df.printSchema()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "56bce0e3-bbc1-4531-ad29-6d7a36eea73c", 99 | "metadata": { 100 | "tags": [] 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "df.writeTo(\"db.catalog_test\").createOrReplace()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "b5d3c232-1ab7-46a8-9632-533ba7186510", 111 | "metadata": { 112 | "tags": [] 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "res = spark.sql(\"SELECT * FROM db.catalog_test\")\n", 117 | "res.show()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "1fd4c30e-495a-4a02-8b1b-f843ad94e1fd", 124 | "metadata": { 125 | "tags": [] 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "spark.sql(\"ALTER TABLE db.catalog_test RENAME TO db.pg_catalog;\")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "b299eb57-bd43-4e3f-8afa-9394fe08743b", 136 | 
"metadata": { 137 | "tags": [] 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "spark.sql(\"ALTER TABLE db.pg_catalog SET TBLPROPERTIES ('comment' = 'Learning Iceberg.')\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "813ad05c-4b61-4852-8bcf-50c2a118febc", 148 | "metadata": { 149 | "tags": [] 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "spark.sql(\"ALTER TABLE db.pg_catalog RENAME COLUMN fn TO firstname\")\n", 154 | "spark.sql(\"ALTER TABLE db.pg_catalog RENAME COLUMN mid TO middlename\")" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "47896cd7", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "new_data = [(\"New James\",\"\",\"Smith\",\"36646\", \"M\", 50),\n", 165 | " ]\n", 166 | "\n", 167 | "schema = StructType([ \\\n", 168 | " StructField(\"firstname\",StringType(),True), \\\n", 169 | " StructField(\"middlename\",StringType(),True), \\\n", 170 | " StructField(\"lastname\",StringType(),True), \\\n", 171 | " StructField(\"id\", StringType(), True), \\\n", 172 | " StructField(\"gender\", StringType(), True), \\\n", 173 | " StructField(\"salary\", IntegerType(), True) \\\n", 174 | " ])\n", 175 | "\n", 176 | "df = spark.createDataFrame(data=new_data, schema=schema)\n", 177 | "df.writeTo(\"db.pg_catalog\").append()\n", 178 | "\n", 179 | "res = spark.sql(\"SELECT * FROM db.pg_catalog\")\n", 180 | "res.show()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "f30436a9", 187 | "metadata": { 188 | "tags": [] 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "res = spark.sql(\"SELECT made_current_at, snapshot_id, parent_id, is_current_ancestor FROM db.pg_catalog.history\")\n", 193 | "res.show(truncate=False)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "1d6feb77", 200 | "metadata": { 201 | "tags": [] 202 | }, 203 | "outputs": [], 204 | 
"source": [ 205 | "res = spark.sql(\"SELECT committed_at, snapshot_id, operation, manifest_list, summary FROM db.pg_catalog.snapshots\")\n", 206 | "res.show(truncate=False)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "id": "e81da59f-c77f-4d02-a52a-05c0a87f4483", 213 | "metadata": { 214 | "tags": [] 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "res = spark.sql(\"SELECT file_path, file_format, record_count FROM db.pg_catalog.files\")\n", 219 | "res.show(truncate=False)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "id": "986f3ae0-c4b6-4de5-80ab-44556c76f106", 226 | "metadata": { 227 | "tags": [] 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "res = spark.sql(\"SELECT length, path, added_data_files_count, added_snapshot_id FROM db.pg_catalog.manifests\")\n", 232 | "res.show(truncate=False)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "2f0fd290-4e2d-443c-814c-ea65b7e368f4", 239 | "metadata": { 240 | "tags": [] 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "res = spark.sql(\"SELECT timestamp, file, latest_snapshot_id, latest_schema_id, latest_sequence_number FROM db.pg_catalog.metadata_log_entries\")\n", 245 | "res.show(truncate=False)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "fb9bb08e-33fa-449b-8b61-83a84cf1e7f1", 252 | "metadata": { 253 | "tags": [] 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "spark.catalog" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "id": "97d02afa-a3a2-4553-8074-3514b3778c49", 264 | "metadata": { 265 | "tags": [] 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "spark.catalog.currentDatabase()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "c0cf36c8-c320-4bfa-9e5b-80843814b03a", 276 | "metadata": { 277 | "tags": [] 278 | }, 
279 | "outputs": [], 280 | "source": [ 281 | "spark.catalog.listTables()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "id": "a0b91051-7a13-4fb2-9a94-ea83c43b3ee6", 288 | "metadata": { 289 | "tags": [] 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "spark.catalog.listDatabases()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "id": "749ba2f8-f330-4d08-8d1d-c8f8882ba995", 300 | "metadata": { 301 | "tags": [] 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "spark.sql('show databases').show()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "id": "536b3118-6b54-404a-9561-1d94b49dbede", 312 | "metadata": { 313 | "tags": [] 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "spark.catalog.currentDatabase()" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "id": "39b1de4a-9a3d-4cf7-a4b1-7c374f32b2d3", 324 | "metadata": { 325 | "tags": [] 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "spark.catalog.listTables('db')" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "f031ddb2-0531-46c1-8b71-0c85b04bfc3b", 336 | "metadata": { 337 | "tags": [] 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "spark.sql('show tables from db').show()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "id": "90c5bfe5-ad34-4d0d-b06c-1844711b9f44", 348 | "metadata": { 349 | "tags": [] 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "spark.catalog.listCatalogs()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "id": "2736df5e-f5fc-46c9-a99f-8068f78f7c20", 360 | "metadata": { 361 | "tags": [] 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "spark.stop()" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id":
"e0d6e63f-3392-463b-845b-e96f339a7a14", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 376 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 377 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 378 | " (\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 379 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 380 | " ]" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "id": "2617db61-6a7d-4e3d-8839-8a35a58becc0", 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "Python 3 (ipykernel)", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.11.6" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 5 413 | } 414 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/postgres-catalog-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "694fd852-8c95-411f-864c-919941ba9d22", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark.sql import SparkSession\n", 13 | "from pyspark.sql.types import StructType,StructField, StringType, IntegerType" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "30542263-049f-49aa-9ca1-c821d5cf818e", 20 | "metadata": { 21 | "tags": [] 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "spark = 
SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 26 | "\n", 27 | "spark.stop()" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "7966e2ac-4acb-4fb4-9943-352fdeaf4b71", 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", 40 | "\n", 41 | "spark" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "ebce8acc-912a-4e17-a0af-805dc7117570", 48 | "metadata": { 49 | "tags": [] 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 54 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 55 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 56 | " (\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 57 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 58 | " ]\n", 59 | "\n", 60 | "schema = StructType([ \\\n", 61 | " StructField(\"fn\",StringType(),True), \\\n", 62 | " StructField(\"mid\",StringType(),True), \\\n", 63 | " StructField(\"lastname\",StringType(),True), \\\n", 64 | " StructField(\"id\", StringType(), True), \\\n", 65 | " StructField(\"gender\", StringType(), True), \\\n", 66 | " StructField(\"salary\", IntegerType(), True) \\\n", 67 | " ])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "6d06e12f-ec86-4935-a065-68901e98c8b9", 74 | "metadata": { 75 | "tags": [] 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "data" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "555b9977-8792-4476-8110-25f62bd981ff", 86 | "metadata": { 87 | "tags": [] 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "df = spark.createDataFrame(data=data, schema=schema)\n", 92 | "df.printSchema()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "56bce0e3-bbc1-4531-ad29-6d7a36eea73c", 99 | "metadata": { 100 | 
"tags": [] 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "df.writeTo(\"db.catalog_test\").createOrReplace()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "b5d3c232-1ab7-46a8-9632-533ba7186510", 111 | "metadata": { 112 | "tags": [] 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "res = spark.sql(\"SELECT * FROM db.catalog_test\")\n", 117 | "res.show()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "1fd4c30e-495a-4a02-8b1b-f843ad94e1fd", 124 | "metadata": { 125 | "tags": [] 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "spark.sql(\"ALTER TABLE db.catalog_test RENAME TO db.pg_catalog;\")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "b299eb57-bd43-4e3f-8afa-9394fe08743b", 136 | "metadata": { 137 | "tags": [] 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "spark.sql(\"ALTER TABLE db.pg_catalog SET TBLPROPERTIES ('comment' = 'Learning Iceberg.')\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "813ad05c-4b61-4852-8bcf-50c2a118febc", 148 | "metadata": { 149 | "tags": [] 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "spark.sql(\"ALTER TABLE db.pg_catalog RENAME COLUMN fn TO firstname\")\n", 154 | "spark.sql(\"ALTER TABLE db.pg_catalog RENAME COLUMN mid TO middlename\")" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "47896cd7", 161 | "metadata": { 162 | "jupyter": { 163 | "outputs_hidden": false 164 | } 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "new_data = [(\"New James\",\"\",\"Smith\",\"36646\", \"M\", 50),\n", 169 | " ]\n", 170 | "\n", 171 | "schema = StructType([ \\\n", 172 | " StructField(\"firstname\",StringType(),True), \\\n", 173 | " StructField(\"middlename\",StringType(),True), \\\n", 174 | " StructField(\"lastname\",StringType(),True), \\\n", 175 | " StructField(\"id\", 
StringType(), True), \\\n", 176 | " StructField(\"gender\", StringType(), True), \\\n", 177 | " StructField(\"salary\", IntegerType(), True) \\\n", 178 | " ])\n", 179 | "\n", 180 | "df = spark.createDataFrame(data=new_data, schema=schema)\n", 181 | "df.writeTo(\"db.pg_catalog\").append()\n", 182 | "\n", 183 | "res = spark.sql(\"SELECT * FROM db.pg_catalog\")\n", 184 | "res.show()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "id": "baa9473b", 191 | "metadata": { 192 | "jupyter": { 193 | "outputs_hidden": false 194 | } 195 | }, 196 | "outputs": [], 197 | "source": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "20771c6e", 203 | "metadata": { 204 | "jupyter": { 205 | "outputs_hidden": false 206 | } 207 | }, 208 | "outputs": [], 209 | "source": [] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "id": "4edc3667", 215 | "metadata": { 216 | "jupyter": { 217 | "outputs_hidden": false 218 | } 219 | }, 220 | "outputs": [], 221 | "source": [] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "id": "19649c80", 227 | "metadata": { 228 | "jupyter": { 229 | "outputs_hidden": false 230 | } 231 | }, 232 | "outputs": [], 233 | "source": [] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "f30436a9", 239 | "metadata": { 240 | "jupyter": { 241 | "outputs_hidden": false 242 | }, 243 | "tags": [] 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "res = spark.sql(\"SELECT made_current_at, snapshot_id, parent_id, is_current_ancestor FROM db.pg_catalog.history\")\n", 248 | "res.show(truncate=False)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "id": "1d6feb77", 255 | "metadata": { 256 | "jupyter": { 257 | "outputs_hidden": false 258 | }, 259 | "tags": [] 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "res = spark.sql(\"SELECT committed_at, 
snapshot_id, operation, manifest_list, summary FROM db.pg_catalog.snapshots\")\n", 264 | "res.show(truncate=False)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "e81da59f-c77f-4d02-a52a-05c0a87f4483", 271 | "metadata": { 272 | "tags": [] 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "res = spark.sql(\"SELECT file_path, file_format, record_count FROM db.pg_catalog.files\")\n", 277 | "res.show(truncate=False)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "id": "986f3ae0-c4b6-4de5-80ab-44556c76f106", 284 | "metadata": { 285 | "tags": [] 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "res = spark.sql(\"SELECT length, path, added_data_files_count, added_snapshot_id FROM db.pg_catalog.manifests\")\n", 290 | "res.show(truncate=False)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "id": "2f0fd290-4e2d-443c-814c-ea65b7e368f4", 297 | "metadata": { 298 | "tags": [] 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "res = spark.sql(\"SELECT timestamp, file, latest_snapshot_id, latest_schema_id, latest_sequence_number FROM db.pg_catalog.metadata_log_entries\")\n", 303 | "res.show(truncate=False)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "id": "fb9bb08e-33fa-449b-8b61-83a84cf1e7f1", 310 | "metadata": { 311 | "tags": [] 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "spark.catalog" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "id": "97d02afa-a3a2-4553-8074-3514b3778c49", 322 | "metadata": { 323 | "tags": [] 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "spark.catalog.currentDatabase()" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "c0cf36c8-c320-4bfa-9e5b-80843814b03a", 334 | "metadata": { 335 | "tags": [] 336 | }, 337 | "outputs": [], 338 | "source": [ 339 |
"spark.catalog.listTables()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "id": "a0b91051-7a13-4fb2-9a94-ea83c43b3ee6", 346 | "metadata": { 347 | "tags": [] 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "spark.catalog.listDatabases()" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "id": "749ba2f8-f330-4d08-8d1d-c8f8882ba995", 358 | "metadata": { 359 | "tags": [] 360 | }, 361 | "outputs": [], 362 | "source": [ 363 | "spark.sql('show databases').show()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "id": "536b3118-6b54-404a-9561-1d94b49dbede", 370 | "metadata": { 371 | "tags": [] 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "spark.catalog.currentDatabase()" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "id": "39b1de4a-9a3d-4cf7-a4b1-7c374f32b2d3", 382 | "metadata": { 383 | "tags": [] 384 | }, 385 | "outputs": [], 386 | "source": [ 387 | "spark.catalog.listTables('db')" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "id": "f031ddb2-0531-46c1-8b71-0c85b04bfc3b", 394 | "metadata": { 395 | "tags": [] 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "spark.sql('show tables from db').show()" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "id": "90c5bfe5-ad34-4d0d-b06c-1844711b9f44", 406 | "metadata": { 407 | "tags": [] 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "spark.catalog.listCatalogs()" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "id": "2736df5e-f5fc-46c9-a99f-8068f78f7c20", 418 | "metadata": { 419 | "tags": [] 420 | }, 421 | "outputs": [], 422 | "source": [ 423 | "spark.stop()" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "id": "e0d6e63f-3392-463b-845b-e96f339a7a14", 430 | "metadata": {}, 431 |
"outputs": [], 432 | "source": [ 433 | "data = [(\"James\",\"\",\"Smith\",\"36636\",\"M\",3000),\n", 434 | " (\"Michael\",\"Rose\",\"\",\"40288\",\"M\",4000),\n", 435 | " (\"Robert\",\"\",\"Williams\",\"42114\",\"M\",4000),\n", 436 | " (\"Maria\",\"Anne\",\"Jones\",\"39192\",\"F\",4000),\n", 437 | " (\"Jen\",\"Mary\",\"Brown\",\"\",\"F\",-1)\n", 438 | " ]" 439 | ] 440 | } 441 | ], 442 | "metadata": { 443 | "kernelspec": { 444 | "display_name": "Python 3 (ipykernel)", 445 | "language": "python", 446 | "name": "python3" 447 | }, 448 | "language_info": { 449 | "codemirror_mode": { 450 | "name": "ipython", 451 | "version": 3 452 | }, 453 | "file_extension": ".py", 454 | "mimetype": "text/x-python", 455 | "name": "python", 456 | "nbconvert_exporter": "python", 457 | "pygments_lexer": "ipython3", 458 | "version": "3.10.11" 459 | } 460 | }, 461 | "nbformat": 4, 462 | "nbformat_minor": 5 463 | } 464 | --------------------------------------------------------------------------------