├── .gitignore ├── bi_dashboards ├── docker-compose.yml └── readme.md ├── dremio-arrow-connection-code.md ├── dremio_api └── testing.md ├── dremio_arrow_python ├── .ipynb_checkpoints │ ├── connection-checkpoint.py │ └── dremio_to_notebook-checkpoint.ipynb ├── connection.py └── dremio_to_notebook.ipynb ├── intro_to_dbt ├── .gitignore ├── docker-compose.yml ├── dremio_tutorial │ ├── .gitignore │ ├── README.md │ ├── analyses │ │ └── .gitkeep │ ├── dbt_project.yml │ ├── macros │ │ └── .gitkeep │ ├── models │ │ ├── example │ │ │ ├── my_first_dbt_model.sql │ │ │ ├── my_second_dbt_model.sql │ │ │ └── schema.yml │ │ └── semanticlayer │ │ │ ├── business_tax_bronze.sql │ │ │ ├── business_tax_silver.sql │ │ │ ├── complex_tax_view_gold.sql │ │ │ ├── individual_tax_bronze.sql │ │ │ └── individual_tax_silver.sql │ ├── seeds │ │ └── .gitkeep │ ├── snapshots │ │ └── .gitkeep │ └── tests │ │ └── .gitkeep └── readme.md ├── intro_to_python ├── .ipynb_checkpoints │ ├── classes-checkpoint.ipynb │ ├── collections-checkpoint.ipynb │ ├── functions-checkpoint.ipynb │ ├── introtopython-checkpoint.ipynb │ ├── loops-checkpoint.ipynb │ ├── operators-checkpoint.ipynb │ └── readme-checkpoint.md ├── classes.ipynb ├── collections.ipynb ├── functions.ipynb ├── introtopython.ipynb ├── loops.ipynb ├── operators.ipynb ├── polars-exercises │ ├── .ipynb_checkpoints │ │ ├── charts-checkpoint.ipynb │ │ ├── director_dimension_table-checkpoint.csv │ │ ├── genre_dimension_table-checkpoint.csv │ │ ├── introtopolars-checkpoint.ipynb │ │ ├── movie_fact_table-checkpoint.csv │ │ ├── movies-checkpoint.ipynb │ │ ├── read_join_csv-checkpoint.ipynb │ │ └── write_csv-checkpoint.ipynb │ ├── charts.ipynb │ ├── director_dimension_table.csv │ ├── genre_dimension_table.csv │ ├── introtopolars.ipynb │ ├── movie_fact_table.csv │ ├── movies.ipynb │ ├── read_join_csv.ipynb │ └── write_csv.ipynb ├── readme.md └── scratch.py ├── intro_to_sql └── scripts.md ├── parquet └── weather_data.parquet ├── readme.md ├── spark_scripts ├── 
file_system_table.md ├── nessie_ingest.md ├── nessie_setup.md └── pyspark_basics.md └── sql_using_dremio ├── CASE.md ├── MEDIAN.md ├── MERGE_INTO.md └── STUDENTS.md /.gitignore: -------------------------------------------------------------------------------- 1 | *amtest* -------------------------------------------------------------------------------- /bi_dashboards/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | # Nessie Catalog Server Using In-Memory Store 5 | catalog: 6 | image: projectnessie/nessie:0.76.0 7 | container_name: catalog 8 | networks: 9 | dremio-laptop-lakehouse: 10 | ports: 11 | - 19120:19120 12 | # Superset for Building BI Dashboard 13 | dashboards: 14 | image: alexmerced/dremio-superset 15 | container_name: dashboards 16 | networks: 17 | dremio-laptop-lakehouse: 18 | ports: 19 | - 8080:8088 20 | # Minio Storage Server 21 | storage: 22 | image: minio/minio:RELEASE.2024-01-01T16-36-33Z 23 | container_name: storage 24 | environment: 25 | - MINIO_ROOT_USER=admin 26 | - MINIO_ROOT_PASSWORD=password 27 | - MINIO_DOMAIN=storage 28 | - MINIO_REGION_NAME=us-east-1 29 | - MINIO_REGION=us-east-1 30 | networks: 31 | dremio-laptop-lakehouse: 32 | ports: 33 | - 9001:9001 34 | - 9000:9000 35 | command: ["server", "/data", "--console-address", ":9001"] 36 | # Dremio 37 | dremio: 38 | platform: linux/x86_64 39 | image: dremio/dremio-oss:latest 40 | ports: 41 | - 9047:9047 42 | - 31010:31010 43 | - 32010:32010 44 | container_name: dremio 45 | networks: 46 | dremio-laptop-lakehouse: 47 | networks: 48 | dremio-laptop-lakehouse: -------------------------------------------------------------------------------- /bi_dashboards/readme.md: -------------------------------------------------------------------------------- 1 | [Details on Setup](https://github.com/developer-advocacy-dremio/quick-guides-from-dremio/blob/main/guides/superset-dremio.md) 2 | 3 | ```sql 4 | CREATE TABLE 
orders ( 5 | order_id INT, 6 | order_date DATE, 7 | customer_id INT, 8 | customer_name VARCHAR, 9 | product_id INT, 10 | product_name VARCHAR, 11 | quantity INT, 12 | unit_price DECIMAL(10,2), 13 | total_price DECIMAL(10, 2), 14 | order_status VARCHAR, 15 | payment_method VARCHAR, 16 | shipping_country VARCHAR, 17 | shipping_cost DECIMAL(10,2) 18 | ); 19 | ``` 20 | 21 | ```sql 22 | INSERT INTO orders 23 | (order_id, order_date, customer_id, customer_name, product_id, product_name, quantity, unit_price, total_price, order_status, payment_method, shipping_country, shipping_cost) 24 | VALUES 25 | (1, '2023-11-20', 123, 'Alice Smith', 543, 'Wireless Headphones', 1, 79.99, 79.99, 'Delivered', 'Credit Card', 'USA', 5.99), 26 | (2, '2023-12-05', 456, 'Bob Johnson', 129, 'Running Shoes', 2, 89.99, 179.98, 'Shipped', 'PayPal', 'Germany', 9.99), 27 | (3, '2023-12-12', 789, 'Charlie Lee', 875, 'Smart Watch', 1, 199.99, 199.99, 'Pending', 'Debit Card', 'China', 12.99), 28 | (4, '2023-12-18', 123, 'Alice Smith', 231, 'Coffee Maker', 1, 49.99, 49.99, 'Processed', 'Credit Card', 'USA', 4.99), 29 | (5, '2023-12-21', 951, 'David Brown', 654, 'Bluetooth Speaker', 3, 39.99, 119.97, 'Delivered', 'Cash on Delivery', 'UK', 7.99), 30 | (6, '2024-01-03', 456, 'Bob Johnson', 369, 'Fitness Tracker', 2, 59.99, 119.98, 'Shipped', 'PayPal', 'Germany', 9.99), 31 | (7, '2024-01-08', 258, 'Emily Clark', 987, 'Laptop Bag', 1, 29.99, 29.99, 'Pending', 'Debit Card', 'Canada', 8.99), 32 | (8, '2024-01-10', 789, 'Charlie Lee', 543, 'Wireless Headphones', 2, 79.99, 159.98, 'Processed', 'Credit Card', 'China', 12.99), 33 | (9, '2024-01-15', 123, 'Alice Smith', 102, 'Book: The Martian', 1, 15.99, 15.99, 'Delivered', 'Credit Card', 'USA', 3.99), 34 | (10, '2024-01-16', 951, 'David Brown', 459, 'Wireless Keyboard', 1, 34.99, 34.99, 'Shipped', 'PayPal', 'UK', 6.99); 35 | ``` 36 | -------------------------------------------------------------------------------- /dremio-arrow-connection-code.md: 
-------------------------------------------------------------------------------- 1 | ## Dremio Arrow Client 2 | 3 | A library already exists on PyPI called `dremio-simple-query`, whose source code you can find here: 4 | 5 | - [dremio-simple-query](https://github.com/developer-advocacy-dremio/dremio_simple_query/blob/main/src/dremio_simple_query/connect.py) 6 | 7 | ## Modified For Polars 8 | 9 | This library isn't in the conda repository, so you can just use this modified version below for use with Polars. Make sure Polars and Pyarrow are in your environment for the below to work. 10 | 11 | ``` 12 | conda install pyarrow polars 13 | ``` 14 | 15 | ``` 16 | pip install pyarrow polars 17 | ``` 18 | 19 | ```py 20 | #---------------------------------- 21 | # IMPORTS 22 | #---------------------------------- 23 | ## Import Pyarrow 24 | from pyarrow import flight 25 | from pyarrow.flight import FlightClient 26 | import polars as pl 27 | 28 | class DremioConnection: 29 | 30 | def __init__(self, token, location): 31 | self.token = token 32 | self.location = location 33 | self.headers = [ 34 | (b"authorization", f"bearer {token}".encode("utf-8")) 35 | ] 36 | self.client = FlightClient(location=(location)) 37 | 38 | def query(self, query, client, headers): 39 | ## Options for Query 40 | options = flight.FlightCallOptions(headers=headers) 41 | 42 | ## Get ticket for query execution, used to get results 43 | flight_info = client.get_flight_info(flight.FlightDescriptor.for_command(query), options) 44 | 45 | ## Get Results (Return Value a FlightStreamReader) 46 | results = client.do_get(flight_info.endpoints[0].ticket, options) 47 | return results 48 | 49 | # Returns a FlightStreamReader 50 | def toArrow(self, query): 51 | return self.query(query, self.client, self.headers) 52 | 53 | #Returns a Polars DataFrame 54 | def toPolars(self, querystring): 55 | streamReader = self.query(querystring, self.client, self.headers) 56 | table = streamReader.read_all() 57 | df = 
pl.from_arrow(table) 58 | return df 59 | ``` -------------------------------------------------------------------------------- /dremio_api/testing.md: -------------------------------------------------------------------------------- 1 | ### Getting an Auth Token 2 | 3 | ```shell 4 | curl -X POST 'http://localhost:9047/apiv2/login' \ 5 | --header 'Content-Type: application/json' \ 6 | --data-raw '{ 7 | "userName": "username", 8 | "password": "password" 9 | }' 10 | ``` 11 | 12 | example response: 13 | 14 | ```json 15 | { 16 | "token": "o37sfbrkvuva2nlc3tnc50il72", 17 | "userName": "alexmerced", 18 | "firstName": "Alex", 19 | "lastName": "Merced", 20 | "expires": 1703549412550, 21 | "email": "data@alexmerced.com", 22 | "userId": "4181f938-4a7c-4e4a-9e52-cf98bc0450dc", 23 | "admin": true, 24 | "clusterId": "30f59341-f8d6-413d-aa40-d83616b2cf23", 25 | "clusterCreatedAt": 1703369496672, 26 | "version": "24.2.6-202311250456170399-68acbe47", 27 | "permissions": { 28 | "canUploadProfiles": true, 29 | "canDownloadProfiles": true, 30 | "canEmailForSupport": true, 31 | "canChatForSupport": false, 32 | "canViewAllJobs": true, 33 | "canCreateUser": true, 34 | "canCreateRole": true, 35 | "canCreateSource": true, 36 | "canUploadFile": true, 37 | "canManageNodeActivity": true, 38 | "canManageEngines": true, 39 | "canManageQueues": true, 40 | "canManageEngineRouting": true, 41 | "canManageSupportSettings": true 42 | }, 43 | "userCreatedAt": 1703369927940 44 | } 45 | ``` 46 | 47 | -------------------------------------------------------------------------------- /dremio_arrow_python/.ipynb_checkpoints/connection-checkpoint.py: -------------------------------------------------------------------------------- 1 | #---------------------------------- 2 | # IMPORTS 3 | #---------------------------------- 4 | ## Import Pyarrow 5 | from pyarrow import flight 6 | from pyarrow.flight import FlightClient 7 | import polars as pl 8 | 9 | class DremioConnection: 10 | 11 | def __init__(self, token, 
location): 12 | self.token = token 13 | self.location = location 14 | self.headers = [ 15 | (b"authorization", f"bearer {token}".encode("utf-8")) 16 | ] 17 | self.client = FlightClient(location=(location)) 18 | 19 | def query(self, query, client, headers): 20 | ## Options for Query 21 | options = flight.FlightCallOptions(headers=headers) 22 | 23 | ## Get ticket to for query execution, used to get results 24 | flight_info = client.get_flight_info(flight.FlightDescriptor.for_command(query), options) 25 | 26 | ## Get Results (Return Value a FlightStreamReader) 27 | results = client.do_get(flight_info.endpoints[0].ticket, options) 28 | return results 29 | 30 | # Returns a FlightStreamReader 31 | def toArrow(self, query): 32 | return self.query(query, self.client, self.headers) 33 | 34 | #Returns a DuckDB Relation 35 | def toPolars(self, querystring): 36 | streamReader = self.query(querystring, self.client, self.headers) 37 | table = streamReader.read_all() 38 | df = pl.from_arrow(table) 39 | return df -------------------------------------------------------------------------------- /dremio_arrow_python/.ipynb_checkpoints/dremio_to_notebook-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /dremio_arrow_python/connection.py: -------------------------------------------------------------------------------- 1 | #---------------------------------- 2 | # IMPORTS 3 | #---------------------------------- 4 | ## Import Pyarrow 5 | from pyarrow import flight 6 | from pyarrow.flight import FlightClient 7 | import polars as pl 8 | 9 | class DremioConnection: 10 | 11 | def __init__(self, token, location): 12 | self.token = token 13 | self.location = location 14 | self.headers = [ 15 | (b"authorization", f"bearer {token}".encode("utf-8")) 16 | ] 17 | self.client 
= FlightClient(location=(location)) 18 | 19 | def query(self, query, client, headers): 20 | ## Options for Query 21 | options = flight.FlightCallOptions(headers=headers) 22 | 23 | ## Get ticket to for query execution, used to get results 24 | flight_info = client.get_flight_info(flight.FlightDescriptor.for_command(query), options) 25 | 26 | ## Get Results (Return Value a FlightStreamReader) 27 | results = client.do_get(flight_info.endpoints[0].ticket, options) 28 | return results 29 | 30 | # Returns a FlightStreamReader 31 | def toArrow(self, query): 32 | return self.query(query, self.client, self.headers) 33 | 34 | #Returns a Polars Dataframe 35 | def toPolars(self, querystring): 36 | streamReader = self.query(querystring, self.client, self.headers) 37 | table = streamReader.read_all() 38 | df = pl.from_arrow(table) 39 | return df -------------------------------------------------------------------------------- /dremio_arrow_python/dremio_to_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 13, 6 | "id": "0358dce6-ab7c-4777-92a3-49c7ec667111", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from connection import DremioConnection\n", 11 | "from amtest_configs import config\n", 12 | "import seaborn as sns\n", 13 | "import polars as pl\n", 14 | "\n", 15 | "token = config['token']\n", 16 | "url = 'grpc+tls://data.dremio.cloud:443'" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 16, 22 | "id": "1b53c93e-8d6a-48ae-a184-7633ab676c09", 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "shape: (1_522, 6)\n", 30 | "┌─────────────────────────┬───────────┬──────────┬──────────┬───────────┬───────────┐\n", 31 | "│ date ┆ awnd ┆ prcp ┆ snow ┆ tempmax ┆ tempmin │\n", 32 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 33 | "│ datetime[ms] ┆ f64 ┆ f64 ┆ f64 ┆ 
f64 ┆ f64 │\n", 34 | "╞═════════════════════════╪═══════════╪══════════╪══════════╪═══════════╪═══════════╡\n", 35 | "│ 2013-02-09 20:47:12.054 ┆ 11.613907 ┆ 1.711757 ┆ 0.134577 ┆ 67.161372 ┆ 21.216398 │\n", 36 | "│ 2014-09-29 03:12:41.036 ┆ 2.951145 ┆ 1.711623 ┆ 0.075035 ┆ 53.120871 ┆ 26.465267 │\n", 37 | "│ 2014-11-26 22:29:17.629 ┆ 2.014154 ┆ 1.747144 ┆ 0.042906 ┆ 54.024588 ┆ 30.548978 │\n", 38 | "│ 2014-06-19 11:40:01.293 ┆ 4.876666 ┆ 1.735431 ┆ 0.282389 ┆ 48.426897 ┆ 37.014052 │\n", 39 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", 40 | "│ 2014-06-09 19:39:11.242 ┆ 11.207169 ┆ 1.156399 ┆ 0.041152 ┆ 96.580112 ┆ 16.005481 │\n", 41 | "│ 2013-01-24 23:53:10.793 ┆ 8.863728 ┆ 1.906099 ┆ 0.065432 ┆ 40.398773 ┆ 30.559263 │\n", 42 | "│ 2013-01-07 07:52:10.866 ┆ 1.433106 ┆ 1.885225 ┆ 0.278207 ┆ 64.194509 ┆ 19.758959 │\n", 43 | "│ 2013-07-11 18:06:19.968 ┆ 8.112909 ┆ 1.933141 ┆ 0.286537 ┆ 53.364714 ┆ 31.51478 │\n", 44 | "└─────────────────────────┴───────────┴──────────┴──────────┴───────────┴───────────┘\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "## Establish Connection\n", 50 | "dremio = DremioConnection(token, url)\n", 51 | "\n", 52 | "## Run Query, Get Back Polars DF\n", 53 | "df = dremio.toPolars('SELECT * FROM dremiocloud101.\"Weather_PRCP_Over1\"')\n", 54 | "\n", 55 | "## Print df\n", 56 | "print(df)\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 17, 62 | "id": "6caf33f3-a871-4dbc-8eea-a7d042618aaf", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "[Text(0.5, 1.0, 'Precipitation by Date')]" 69 | ] 70 | }, 71 | "execution_count": 17, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | }, 75 | { 76 | "data": { 77 | "image/png": "", 78 | "text/plain": [ 79 | "
" 80 | ] 81 | }, 82 | "metadata": {}, 83 | "output_type": "display_data" 84 | } 85 | ], 86 | "source": [ 87 | "sns.lineplot(data=df, x=\"date\", y=\"prcp\").set(title=\"Precipitation by Date\")" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "id": "ecc72f88-ea3b-4bfa-9992-453c5b390473", 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [] 97 | } 98 | ], 99 | "metadata": { 100 | "kernelspec": { 101 | "display_name": "Python 3 (ipykernel)", 102 | "language": "python", 103 | "name": "python3" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.11.5" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 5 120 | } 121 | -------------------------------------------------------------------------------- /intro_to_dbt/.gitignore: -------------------------------------------------------------------------------- 1 | /venv -------------------------------------------------------------------------------- /intro_to_dbt/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | # Nessie Catalog Server Using In-Memory Store 5 | catalog: 6 | image: projectnessie/nessie:0.76.0 7 | container_name: catalog 8 | networks: 9 | dremio-laptop-lakehouse: 10 | ports: 11 | - 19120:19120 12 | # Minio Storage Server 13 | storage: 14 | image: minio/minio:RELEASE.2024-01-01T16-36-33Z 15 | container_name: storage 16 | environment: 17 | - MINIO_ROOT_USER=admin 18 | - MINIO_ROOT_PASSWORD=password 19 | - MINIO_DOMAIN=storage 20 | - MINIO_REGION_NAME=us-east-1 21 | - MINIO_REGION=us-east-1 22 | networks: 23 | dremio-laptop-lakehouse: 24 | ports: 25 | - 9001:9001 26 | - 9000:9000 27 | command: ["server", "/data", "--console-address", 
":9001"] 28 | # Dremio 29 | dremio: 30 | platform: linux/x86_64 31 | image: dremio/dremio-oss:latest 32 | ports: 33 | - 9047:9047 34 | - 31010:31010 35 | - 32010:32010 36 | container_name: dremio 37 | networks: 38 | dremio-laptop-lakehouse: 39 | networks: 40 | dremio-laptop-lakehouse: -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexMercedCoder/understanding_data_with_alex_merced/981d77b2eebffcc33ef5aa60c4512bb311bb4dfe/intro_to_dbt/dremio_tutorial/analyses/.gitkeep -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your 
project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'dremio_tutorial' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'dremio_tutorial' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_packages" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ 31 | # directory as views. These settings can be overridden in the individual model 32 | # files using the `{{ config(...) }}` macro. 
33 | models: 34 | dremio_tutorial: 35 | # Config indicated by + and applies to all files under models/example/ 36 | example: 37 | +materialized: view 38 | semanticlayer: 39 | +materialized: view 40 | -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexMercedCoder/understanding_data_with_alex_merced/981d77b2eebffcc33ef5aa60c4512bb311bb4dfe/intro_to_dbt/dremio_tutorial/macros/.gitkeep -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/models/example/my_first_dbt_model.sql: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | Welcome to your first dbt model! 4 | Did you know that you can also configure models directly within SQL files? 5 | This will override configurations stated in dbt_project.yml 6 | 7 | Try changing "table" to "view" below 8 | */ 9 | 10 | {{config(database='dbt_practice2', schema='output.test1')}} 11 | 12 | with source_data as ( 13 | 14 | select 1 as id 15 | union all 16 | select null as id 17 | 18 | ) 19 | 20 | select * 21 | from source_data 22 | 23 | /* 24 | Uncomment the line below to remove records with null `id` values 25 | */ 26 | 27 | -- where id is not null 28 | -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/models/example/my_second_dbt_model.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Use the `ref` function to select from other models 3 | 4 | select * 5 | from {{ ref('my_first_dbt_model') }} 6 | where id = 1 7 | -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/models/example/schema.yml: -------------------------------------------------------------------------------- 1 | 2 
| version: 2 3 | 4 | models: 5 | - name: my_first_dbt_model 6 | description: "A starter dbt model" 7 | columns: 8 | - name: id 9 | description: "The primary key for this table" 10 | tests: 11 | - unique 12 | - not_null 13 | 14 | - name: my_second_dbt_model 15 | description: "A starter dbt model" 16 | columns: 17 | - name: id 18 | description: "The primary key for this table" 19 | tests: 20 | - unique 21 | - not_null 22 | -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/models/semanticlayer/business_tax_bronze.sql: -------------------------------------------------------------------------------- 1 | {{ config(database='tax_collections', schema='bronze')}} 2 | 3 | SELECT * from warehouse.output."business_tax" -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/models/semanticlayer/business_tax_silver.sql: -------------------------------------------------------------------------------- 1 | {{ config(database='tax_collections', schema='silver')}} 2 | 3 | SELECT 4 | business_id, 5 | business_name, 6 | COALESCE(revenue, 0) AS revenue, -- Replacing NULL revenue with 0 7 | GREATEST(tax_paid, 0) AS tax_paid -- Correcting negative tax_paid 8 | FROM {{ ref('business_tax_bronze') }} -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/models/semanticlayer/complex_tax_view_gold.sql: -------------------------------------------------------------------------------- 1 | {{ config(database='tax_collections', schema='gold')}} 2 | 3 | SELECT 4 | 'Individual' AS taxpayer_type, 5 | taxpayer_id AS id, 6 | full_name AS name, 7 | income, 8 | tax_paid 9 | FROM {{ ref('individual_tax_silver') }} 10 | UNION ALL 11 | SELECT 12 | 'Business' AS taxpayer_type, 13 | business_id AS id, 14 | business_name AS name, 15 | revenue AS income, 16 | tax_paid 17 | FROM {{ ref('business_tax_silver') }} 
-------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/models/semanticlayer/individual_tax_bronze.sql: -------------------------------------------------------------------------------- 1 | {{ config(database='tax_collections', schema='bronze')}} 2 | 3 | SELECT * from warehouse.output."individual_tax" -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/models/semanticlayer/individual_tax_silver.sql: -------------------------------------------------------------------------------- 1 | {{ config(database='tax_collections', schema='silver')}} 2 | 3 | SELECT 4 | taxpayer_id, 5 | full_name, 6 | COALESCE(income, 0) AS income, -- Replacing NULL income with 0 7 | GREATEST(tax_paid, 0) AS tax_paid -- Correcting negative tax_paid 8 | FROM {{ ref('individual_tax_bronze') }} -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexMercedCoder/understanding_data_with_alex_merced/981d77b2eebffcc33ef5aa60c4512bb311bb4dfe/intro_to_dbt/dremio_tutorial/seeds/.gitkeep -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexMercedCoder/understanding_data_with_alex_merced/981d77b2eebffcc33ef5aa60c4512bb311bb4dfe/intro_to_dbt/dremio_tutorial/snapshots/.gitkeep -------------------------------------------------------------------------------- /intro_to_dbt/dremio_tutorial/tests/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlexMercedCoder/understanding_data_with_alex_merced/981d77b2eebffcc33ef5aa60c4512bb311bb4dfe/intro_to_dbt/dremio_tutorial/tests/.gitkeep -------------------------------------------------------------------------------- /intro_to_dbt/readme.md: -------------------------------------------------------------------------------- 1 | ## DBT - Data Build Tool 2 | 3 | [More dbt-dremio reference](https://github.com/developer-advocacy-dremio/quick-guides-from-dremio/blob/main/guides/dbt.md) 4 | 5 | - Setup Dremio Locally on our Laptop 6 | - Configure our DBT Profile 7 | 8 | ## Setup Dremio 9 | 10 | - Create a `docker-compose.yml` 11 | 12 | ```yml 13 | version: "3" 14 | 15 | services: 16 | # Nessie Catalog Server Using In-Memory Store 17 | catalog: 18 | image: projectnessie/nessie:0.76.0 19 | container_name: catalog 20 | networks: 21 | dremio-laptop-lakehouse: 22 | ports: 23 | - 19120:19120 24 | # Minio Storage Server 25 | storage: 26 | image: minio/minio:RELEASE.2024-01-01T16-36-33Z 27 | container_name: storage 28 | environment: 29 | - MINIO_ROOT_USER=admin 30 | - MINIO_ROOT_PASSWORD=password 31 | - MINIO_DOMAIN=storage 32 | - MINIO_REGION_NAME=us-east-1 33 | - MINIO_REGION=us-east-1 34 | networks: 35 | dremio-laptop-lakehouse: 36 | ports: 37 | - 9001:9001 38 | - 9000:9000 39 | command: ["server", "/data", "--console-address", ":9001"] 40 | # Dremio 41 | dremio: 42 | platform: linux/x86_64 43 | image: dremio/dremio-oss:latest 44 | ports: 45 | - 9047:9047 46 | - 31010:31010 47 | - 32010:32010 48 | container_name: dremio 49 | networks: 50 | dremio-laptop-lakehouse: 51 | networks: 52 | dremio-laptop-lakehouse: 53 | 54 | ``` 55 | 56 | - [Directions for Dremio Setup](https://github.com/developer-advocacy-dremio/quick-guides-from-dremio/blob/main/guides/nessie_dremio.md) 57 | 58 | ## Setup Python Environment, install dbt 59 | 60 | - `python -m venv venv` 61 | 62 | - `source ./venv/bin/activate` 63 | 64 | - `pip install dbt-dremio` 65 | 66 | ## 
Create a dbt project 67 | 68 | - `dbt init <project_name>` 69 | 70 | - select dremio 71 | 72 | - select dremio with software username/password 73 | 74 | - put `127.0.0.1` as host 75 | 76 | - use `9047` as port 77 | 78 | - put username and password 79 | 80 | - use the name of a nessie/metastore/object storage source for "object_storage_source" 81 | 82 | - write a path to a subfolder in that source for "object_storage_path" 83 | 84 | - write the name of a space for "dremio_space" 85 | 86 | - write the path to a subfolder in your space for "dremio_space_path" 87 | 88 | - select 1 thread 89 | 90 | ## dbt function 91 | 92 | **{{ config() }}** 93 | 94 | Configures the behavior for the following model. 95 | 96 | Example Arguments: 97 | 98 | - `materialized`: `view` to create a sql view or `table` to create a table 99 | 100 | - `database`: The dremio space (view) or source (table) to create the result in 101 | 102 | - `schema`: the path to a subfolder in the source to put the results in. 103 | 104 | **{{ ref() }}** 105 | 106 | Reference to a source model. This ensures that the referenced model will be run before this model. 
107 | 108 | ### Example SQL 109 | 110 | ```sql 111 | -- Creating bronze tables for local tax data 112 | -- Bronze Table 1: Individual Tax Records 113 | CREATE TABLE individual_tax ( 114 | taxpayer_id INT, 115 | full_name VARCHAR, 116 | income FLOAT, 117 | tax_paid FLOAT 118 | ); 119 | 120 | -- Bronze Table 2: Business Tax Records 121 | CREATE TABLE business_tax ( 122 | business_id INT, 123 | business_name VARCHAR, 124 | revenue FLOAT, 125 | tax_paid FLOAT 126 | ); 127 | 128 | -- Inserting flawed data into bronze tables 129 | -- Inserting data into Individual Tax Records 130 | INSERT INTO individual_tax (taxpayer_id, full_name, income, tax_paid) VALUES 131 | (1, 'John Doe', 50000, 5000), 132 | (2, 'Jane Smith', NULL, 4500), -- Missing income 133 | (3, 'Alice Johnson', 70000, -700); -- Negative tax paid (flawed) 134 | 135 | -- Inserting data into Business Tax Records 136 | INSERT INTO business_tax (business_id, business_name, revenue, tax_paid) VALUES 137 | (101, 'ABC Corp', 200000, 20000), 138 | (102, 'XYZ Inc', NULL, 18000), -- Missing revenue 139 | (103, 'Acme LLC', 150000, -1500); -- Negative tax paid (flawed) 140 | 141 | -- Creating silver views to clean up the data 142 | -- Silver View 1: Cleaned Individual Tax Records 143 | CREATE VIEW individual_tax AS 144 | SELECT 145 | taxpayer_id, 146 | full_name, 147 | COALESCE(income, 0) AS income, -- Replacing NULL income with 0 148 | GREATEST(tax_paid, 0) AS tax_paid -- Correcting negative tax_paid 149 | FROM arctic."Tax Collections".bronze.individual_tax; 150 | 151 | -- Silver View 2: Cleaned Business Tax Records 152 | CREATE VIEW business_tax AS 153 | SELECT 154 | business_id, 155 | business_name, 156 | COALESCE(revenue, 0) AS revenue, -- Replacing NULL revenue with 0 157 | GREATEST(tax_paid, 0) AS tax_paid -- Correcting negative tax_paid 158 | FROM arctic."Tax Collections".bronze.business_tax; 159 | 160 | -- Creating a gold view: Consolidated Tax Records 161 | CREATE VIEW tax_records AS 162 | SELECT 163 | 
'Individual' AS taxpayer_type, 164 | taxpayer_id AS id, 165 | full_name AS name, 166 | income, 167 | tax_paid 168 | FROM individual_tax 169 | UNION ALL 170 | SELECT 171 | 'Business' AS taxpayer_type, 172 | business_id AS id, 173 | business_name AS name, 174 | revenue AS income, 175 | tax_paid 176 | FROM business_tax; 177 | ``` -------------------------------------------------------------------------------- /intro_to_python/.ipynb_checkpoints/classes-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /intro_to_python/.ipynb_checkpoints/collections-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /intro_to_python/.ipynb_checkpoints/functions-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /intro_to_python/.ipynb_checkpoints/introtopython-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /intro_to_python/.ipynb_checkpoints/loops-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | 
-------------------------------------------------------------------------------- /intro_to_python/.ipynb_checkpoints/operators-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /intro_to_python/.ipynb_checkpoints/readme-checkpoint.md: -------------------------------------------------------------------------------- 1 | ## Create a Virtual Environment and Run Jupyterlab 2 | 3 | - Must have Python and Anaconda Installed 4 | 5 | ### With Anaconda 6 | 7 | **Create Environment** 8 | 9 | initialize conda if it isn't already initialized 10 | 11 | ```bash 12 | ### For Bash 13 | conda init bash 14 | ### For zsh 15 | conda init zsh 16 | ``` 17 | 18 | create an environment 19 | ``` 20 | conda create -n myenv python=3.11 21 | ``` 22 | 23 | activate the environment 24 | ``` 25 | conda activate myenv 26 | ``` 27 | 28 | To later deactivate an environment 29 | ``` 30 | conda deactivate 31 | ``` 32 | 33 | **Run Notebook** 34 | 35 | install dependencies 36 | ```bash 37 | conda install jupyter jupyterlab 38 | ``` 39 | 40 | run jupyter lab 41 | ```bash 42 | jupyter lab 43 | ``` 44 | 45 | ### With Pip 46 | 47 | create an environment 48 | ```bash 49 | python -m venv venv 50 | ``` 51 | 52 | activate environment 53 | ```bash 54 | source ./venv/bin/activate 55 | ``` 56 | 57 | install jupyter and jupyterlab 58 | ```bash 59 | pip install jupyter jupyterlab 60 | ``` 61 | 62 | run jupyterlab 63 | ``` 64 | jupyter lab 65 | ``` 66 | 67 | ## Installing Polars and Seaborn 68 | 69 | - turn off jupyter lab by hitting `ctrl + c` in your terminal 70 | 71 | - add conda forge to your conda install 72 | 73 | ```bash 74 | conda config --add channels conda-forge 75 | ``` 76 | 77 | - install libraries 78 | 79 | ```bash 80 | conda install polars seaborn 81 | ``` 
-------------------------------------------------------------------------------- /intro_to_python/classes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "id": "11f53bea-efd9-41a9-a3a0-cff5b33d7a71", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "class Dog:\n", 11 | "\n", 12 | " def __init__(self, id, age, name):\n", 13 | " self.id = id\n", 14 | " self.age = age\n", 15 | " self.name = name\n", 16 | "\n", 17 | " def print_status(self):\n", 18 | " print(f\"{self.name}'s age is {self.age}\")\n", 19 | " " 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 5, 25 | "id": "7236d371-d183-405f-9858-6638c84f0f56", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "<__main__.Dog object at 0x7bc5b9271ed0>\n", 33 | "Sparky's age is 6\n", 34 | "6\n", 35 | "Sparky\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "sparky = Dog(1, 6, \"Sparky\")\n", 41 | "print(sparky)\n", 42 | "sparky.print_status()\n", 43 | "print(sparky.age)\n", 44 | "print(sparky.name)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 11, 50 | "id": "10840c1f-8919-4df6-aec3-a7616088cdcd", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "from dataclasses import dataclass\n", 55 | "\n", 56 | "@dataclass\n", 57 | "class DogData:\n", 58 | " id: int\n", 59 | " age: int\n", 60 | " name: str\n", 61 | "\n", 62 | " def print_status(self):\n", 63 | " print(f\"{self.name}'s age is {self.age}\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 12, 69 | "id": "27e1d06c-6a87-47d4-abd0-2d5d40ad014b", 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "DogData(id=2, age=5, name='Sparky')\n", 77 | "5\n", 78 | "Sparky\n", 79 | "Sparky's age is 5\n" 80 | ] 81 | } 82 | ], 83 | 
"source": [ 84 | "spot = DogData(2, 5, \"Sparky\")\n", 85 | "print(spot)\n", 86 | "print(spot.age)\n", 87 | "print(spot.name)\n", 88 | "spot.print_status()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "321ad5fa-529a-49d4-a7df-25edd84590f3", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 3 (ipykernel)", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.11.5" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 5 121 | } 122 | -------------------------------------------------------------------------------- /intro_to_python/collections.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "3476b551-4182-49bf-8a35-b048b6f2023a", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "[1, 2, 3, 4, 5, 6, 7, 8, 9]\n", 14 | "3\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "## Collection, is a way to store multiple pieces of data into one variable\n", 20 | "\n", 21 | "## List - an ordered list of items that can be changed\n", 22 | "my_list = [1,2,3,4,5,6,7,8,9]\n", 23 | "\n", 24 | "print(my_list) #print full list\n", 25 | "print(my_list[2]) # Print out the 3rd (zero based indexing)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 6, 31 | "id": "5c4e2c0f-b1ac-403f-b51f-373f8dd2f551", 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "[1, 2, 3, 4, 5, 6, 7, 8, 
9, 10]\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "my_list.append(10)\n", 44 | "print(my_list)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 7, 50 | "id": "95b9576b-7787-4ae8-be3f-344c2ea18079", 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "6" 57 | ] 58 | }, 59 | "execution_count": 7, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "my_list.pop(5)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 8, 71 | "id": "a185d41d-781b-419f-9a6d-a20413530d94", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "[1, 2, 3, 4, 5, 7, 8, 9, 10]\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "print(my_list)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 9, 89 | "id": "99048418-678a-4f6b-995a-e1ff4c2952f3", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "## Dictionary - Key/Value Pairs" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 10, 99 | "id": "3d5e89fa-6e52-49ea-9728-d070e23f1d83", 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "{'name': 'Alex Merced', 'age': 38}\n", 107 | "Alex Merced\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "my_dictionary = {\"name\" : \"Alex Merced\", \"age\": 38}\n", 113 | "print(my_dictionary) # Printing the entire dictionary\n", 114 | "print(my_dictionary[\"name\"]) # Printing one value from the dictionary" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 11, 120 | "id": "9413f6d2-9f89-4c2a-bde3-1b86e918f71b", 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "(1, 2, 3)\n", 128 | "1\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "## Tuple (List of data that never changes)\n", 134 | 
"my_tuple = (1,2,3)\n", 135 | "print(my_tuple)\n", 136 | "print(my_tuple[0])" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 14, 142 | "id": "0fc1ff25-bc8e-4742-845c-f0f9af0e7762", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "{1, 2, 3, 4, 5, 6, 7}\n", 150 | "2\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "## Sets (list of items with no duplicates)\n", 156 | "my_set = {1,2,3,4,5,6,6,6,6,6,6,7,7,7,7,7,7}\n", 157 | "print(my_set)\n", 158 | "print(list(my_set)[1])" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "4ffce25e-7e78-4c62-a0c3-4a347bf1f8bd", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3 (ipykernel)", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.11.5" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 5 191 | } 192 | -------------------------------------------------------------------------------- /intro_to_python/functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "a3e705cc-c7d0-4977-8fff-28eb0269ad47", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "## Function - Attaching a Block of Code to a Name\n", 11 | "## Run that code anytime for as many times as you want" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 5, 17 | "id": "afdde894-f8fa-47ba-ad80-83d2bba4255f", 18 | "metadata": {}, 19 | "outputs": [], 20 | 
"source": [ 21 | "## Defining the Function\n", 22 | "def my_func(message):\n", 23 | " print(\"================================================\")\n", 24 | " print(message)\n", 25 | " print(\"================================================\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 6, 31 | "id": "23aed727-05d7-40a3-83fe-7576053dca3c", 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "================================================\n", 39 | "cheesE\n", 40 | "================================================\n", 41 | "================================================\n", 42 | "bread\n", 43 | "================================================\n", 44 | "================================================\n", 45 | "wine\n", 46 | "================================================\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "## Invoking the function (using the function)\n", 52 | "my_func(\"cheesE\")\n", 53 | "my_func(\"bread\")\n", 54 | "my_func(\"wine\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 7, 60 | "id": "78089159-e4c5-4fc8-be87-fb4165323667", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "def print_values(x,y):\n", 65 | " print(\"the value of x is\", x)\n", 66 | " print(\"the value of y is\", y)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 8, 72 | "id": "902f2a21-3627-45c2-9a52-d3a9f7aa4622", 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "the value of x is 4\n", 80 | "the value of y is 5\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "print_values(4,5) ## arguments fill parameters in order" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 10, 91 | "id": "24319f47-cf0f-4c6f-be3d-866e46ef87e2", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | 
"text": [ 98 | "the value of x is 5\n", 99 | "the value of y is 4\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "print_values(y=4,x=5)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 12, 110 | "id": "173387be-e1c4-4e7c-8bc0-eb928cf12dd3", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "## Lamdas, quick oneline functions\n", 115 | "add = lambda x,y:x+y" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 13, 121 | "id": "6e7142b5-7c9a-40ad-ab0b-9bb64f3d0841", 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "4\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "print(add(2,2))" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 15, 139 | "id": "2379d463-bfe2-4c35-aa7f-28fc7ac92c70", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "[2, 4, 6, 8, 10]\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "my_list = [1,2,3,4,5]\n", 152 | "new_list = list(map(lambda item:item*2, my_list))\n", 153 | "print(new_list)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 18, 159 | "id": "36eb822f-aa8f-42cb-ab53-349b7866001a", 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "(2, 4)\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "filtered_list = tuple(filter(lambda item:item % 2 == 0, my_list))\n", 172 | "print(filtered_list)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "id": "ccb46ad8-427a-45ee-8ec6-003deb165091", 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Python 3 (ipykernel)", 187 | "language": "python", 188 | "name": "python3" 189 | }, 190 | "language_info": { 191 | 
"codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.11.5" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 5 205 | } 206 | -------------------------------------------------------------------------------- /intro_to_python/introtopython.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "id": "bb60e1da-c5e9-4587-9399-ecc173449a46", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Hello World 2\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "print(\"Hello World 2\")" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 5, 24 | "id": "3472f55f-cdca-456c-99db-ccfa0fc70d9b", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "this is all the output\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "# Using the # I can write comments, these lines are ignored by the python interpreter\n", 37 | "print(\"this is all the output\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 7, 43 | "id": "adf30483-b440-4e34-b23d-65b5a63099c3", 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "2\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "this_is_my_variable = 2\n", 56 | "print(this_is_my_variable)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 8, 62 | "id": "e5ff026d-ceba-46bd-8cd7-64a414d8efcf", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "3\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "this_is_my_variable = 
3\n", 75 | "print(this_is_my_variable)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 9, 81 | "id": "409a5c0e-cbca-4a1e-be61-8da15ba425d3", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "my_string = \"Hello\"" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 10, 91 | "id": "3b2b803e-7af6-4e9b-980c-adad15bf24d8", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "Hello\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "print(my_string)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 13, 109 | "id": "d7a2e89c-90a5-4c20-a86b-88dd25aed945", 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "didn't match any of the numbers\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "my_num = 6\n", 122 | "if (my_num == 8):\n", 123 | " print(\"yes, it is equal\")\n", 124 | "elif (my_num == 7):\n", 125 | " print(\"yes, it is seven\")\n", 126 | "else:\n", 127 | " print(\"didn't match any of the numbers\")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "30e1afc0-7cdc-4157-bc6d-4b02031ddc19", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Python 3 (ipykernel)", 142 | "language": "python", 143 | "name": "python3" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.11.5" 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 5 160 | } 161 | -------------------------------------------------------------------------------- /intro_to_python/loops.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "c418b133-4a7a-4428-a4fd-87f86429588a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "## LOOP and Iteration" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "35528ce4-e688-4693-bd34-eb348bd63bd2", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "## Doing things through repetition" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "id": "90ae40a0-921b-4938-b678-cd7c4216889f", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "0\n", 34 | "1\n", 35 | "2\n", 36 | "3\n", 37 | "4\n", 38 | "5\n", 39 | "6\n", 40 | "7\n", 41 | "8\n", 42 | "9\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "## While\n", 48 | "## While the condition is true\n", 49 | "## The Code block repeats\n", 50 | "counter = 0\n", 51 | "while(counter < 10):\n", 52 | " print(counter)\n", 53 | " counter += 1" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "id": "7361e455-1de2-43ef-a63a-af215f97a9cd", 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "1\n", 67 | "2\n", 68 | "3\n", 69 | "4\n", 70 | "5\n", 71 | "6\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "my_list = [1,2,3,4,5,6]\n", 77 | "for num in my_list:\n", 78 | " print(num)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "id": "508a9cc3-d7a0-4918-88cf-7ae9832c8507", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "1\n", 92 | "2\n", 93 | "3\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "for num in (1,2,3):\n", 99 | " print(num)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 6, 105 | 
"id": "65464936-5be2-4501-bf74-892e1c43eb18", 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "1\n", 113 | "2\n", 114 | "3\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "for num in {1,2,3}:\n", 120 | " print(num)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | "id": "e6e56b0d-9b30-4afe-879e-fd9098149b18", 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "alex merced\n", 134 | "38\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "my_dict = {\"name\":\"alex merced\", \"age\": 38}\n", 140 | "for k in my_dict:\n", 141 | " print(my_dict[k])" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 13, 147 | "id": "3577b6b4-c3c3-4176-aa55-8ffd39e43eac", 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "dict_items([('name', 'alex merced'), ('age', 38)])\n", 155 | "my name equals alex merced\n", 156 | "my age equals 38\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "print(my_dict.items())\n", 162 | "for k,v in my_dict.items():\n", 163 | " print(f\"my {k} equals {v}\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 14, 169 | "id": "2f0aae34-4c75-464a-8282-33b962b5a872", 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "1 2 3\n", 177 | "4 5 6\n", 178 | "7 8 9\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "list_tup = [(1,2,3),(4,5,6),(7,8,9)]\n", 184 | "for x,y,z in list_tup:\n", 185 | " print(x,y,z)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "8985477c-4a07-41b7-9b11-83895228230e", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | 
"display_name": "Python 3 (ipykernel)", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.11.5" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 5 218 | } 219 | -------------------------------------------------------------------------------- /intro_to_python/operators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "22adaa58-3dfa-4d00-9d9b-0796c993166f", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "10\n", 14 | "0\n", 15 | "25\n", 16 | "1.0\n", 17 | "3125\n" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "### MATH OPERATORS\n", 23 | "print(5 + 5) #addition\n", 24 | "print(5 - 5) #subtraction\n", 25 | "print(5 * 5) #multiplication\n", 26 | "print(5 / 5) #division\n", 27 | "print(5 ** 5) #exponent\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "id": "de811a4c-c438-41fb-ab92-8592fe6d5b6c", 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "0\n", 41 | "2\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "## Modulo (remainder operator)\n", 47 | "print( 8 % 2) # If I divided 8 by 2, what is the remainder (should be none)\n", 48 | "print(8 % 3) # If I divided 8 by 3, what is the remainder (remainder of 2)\n", 49 | "\n", 50 | "## If module is anything other than 0, then the first number is not divisible by the second number" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 6, 56 | "id": "7296405b-0347-4c17-ada9-e94802abb234", 57 | "metadata": {}, 58 | 
"outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "it is even\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "## Is 10 even?\n", 69 | "if(10 % 2 == 0):\n", 70 | " print(\"it is even\")\n", 71 | "else:\n", 72 | " print(\"it is not even\")" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 8, 78 | "id": "c563b710-9088-41b2-a9aa-2f5baadb2427", 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "Hello World\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "# Concatenation (adding strings together)\n", 91 | "print(\"Hello\" + \" World\")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 12, 97 | "id": "3d1b9f8f-7f54-451e-b31c-806ea53c3525", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "HelloHelloHelloHelloHello\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "print(\"Hello\" * 5)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 15, 115 | "id": "23cd308e-e76e-44c0-8f75-c796bb989c98", 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "My name is Alex and I am 38\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "## Interpolation (injecting values into a string)\n", 128 | "name = \"Alex\"\n", 129 | "print(f\"My name is {name} and I am {30 + 8}\")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 16, 135 | "id": "624db2fb-ad56-42d8-ad3b-e5c9866cca80", 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "\n", 143 | "Dear Alex,\n", 144 | "\n", 145 | "jfljsdfljsdflkjdflsdkjf;lkdjfa;ldkfja;ldkjfa;ldkjf\n", 146 | "\n", 147 | "\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "print(f\"\"\"\n", 153 | "Dear {name},\n", 154 | "\n", 155 | 
"jfljsdfljsdflkjdflsdkjf;lkdjfa;ldkfja;ldkjfa;ldkjf\n", 156 | "\n", 157 | "\"\"\")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 17, 163 | "id": "613deff6-bcd7-4060-87d5-c311e635c33f", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "### Boolean Operators" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 19, 173 | "id": "86ceb1c1-432b-4989-a366-45df21449352", 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "False\n", 181 | "True\n", 182 | "True\n", 183 | "False\n", 184 | "False\n", 185 | "True\n", 186 | "False\n", 187 | "True\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "print(1 > 2)\n", 193 | "print(1 < 2)\n", 194 | "print(1 <= 2)\n", 195 | "print(1 >= 2)\n", 196 | "print(1 == 2)\n", 197 | "print(1 != 2)\n", 198 | "print(1 != 2 and 1 != 1)\n", 199 | "print(1 != 2 or 1 != 1)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 20, 205 | "id": "49ae18d4-7346-4f5a-add6-377f8ffbffa7", 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "True\n", 213 | "False\n" 214 | ] 215 | }, 216 | { 217 | "ename": "NameError", 218 | "evalue": "name 'true' is not defined", 219 | "output_type": "error", 220 | "traceback": [ 221 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 222 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 223 | "Cell \u001b[0;32mIn[20], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(true)\n", 224 | "\u001b[0;31mNameError\u001b[0m: name 'true' is not defined" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | 
"print(True)\n", 230 | "print(False)\n", 231 | "print(true)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "id": "6c9d19bc-f76d-4937-9f48-e435c802fe16", 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [] 241 | } 242 | ], 243 | "metadata": { 244 | "kernelspec": { 245 | "display_name": "Python 3 (ipykernel)", 246 | "language": "python", 247 | "name": "python3" 248 | }, 249 | "language_info": { 250 | "codemirror_mode": { 251 | "name": "ipython", 252 | "version": 3 253 | }, 254 | "file_extension": ".py", 255 | "mimetype": "text/x-python", 256 | "name": "python", 257 | "nbconvert_exporter": "python", 258 | "pygments_lexer": "ipython3", 259 | "version": "3.11.5" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 5 264 | } 265 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/.ipynb_checkpoints/charts-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/.ipynb_checkpoints/director_dimension_table-checkpoint.csv: -------------------------------------------------------------------------------- 1 | director_id,director_name,birth_year 2 | 1,Christopher Nolan,1970 3 | 2,Lana Wachowski,1965 4 | 3,David Fincher,1962 5 | 4,Quentin Tarantino,1963 6 | 5,Robert Zemeckis,1952 7 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/.ipynb_checkpoints/genre_dimension_table-checkpoint.csv: -------------------------------------------------------------------------------- 1 | genre_id,genre_name 2 | 1,Action 3 | 2,Sci-Fi 4 | 3,Drama 5 | 4,Romance 6 | -------------------------------------------------------------------------------- 
/intro_to_python/polars-exercises/.ipynb_checkpoints/introtopolars-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/.ipynb_checkpoints/movie_fact_table-checkpoint.csv: -------------------------------------------------------------------------------- 1 | id,movie_name,genre_id,director_id 2 | 1,Inception,1,1 3 | 2,The Matrix,2,2 4 | 3,Fight Club,3,3 5 | 4,Pulp Fiction,1,4 6 | 5,Forrest Gump,4,5 7 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/.ipynb_checkpoints/movies-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/.ipynb_checkpoints/read_join_csv-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/.ipynb_checkpoints/write_csv-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/director_dimension_table.csv: -------------------------------------------------------------------------------- 1 | director_id,director_name,birth_year 2 | 1,Christopher Nolan,1970 3 | 2,Lana 
Wachowski,1965 4 | 3,David Fincher,1962 5 | 4,Quentin Tarantino,1963 6 | 5,Robert Zemeckis,1952 7 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/genre_dimension_table.csv: -------------------------------------------------------------------------------- 1 | genre_id,genre_name 2 | 1,Action 3 | 2,Sci-Fi 4 | 3,Drama 5 | 4,Romance 6 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/introtopolars.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d9d77858-80de-49df-b0cc-e1d912dd9056", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "shape: (2, 3)\n", 14 | "┌─────┬──────┬─────┐\n", 15 | "│ id ┆ name ┆ age │\n", 16 | "│ --- ┆ --- ┆ --- │\n", 17 | "│ i64 ┆ str ┆ i64 │\n", 18 | "╞═════╪══════╪═════╡\n", 19 | "│ 1 ┆ Alex ┆ 38 │\n", 20 | "│ 2 ┆ Tony ┆ 35 │\n", 21 | "└─────┴──────┴─────┘\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import polars as pl\n", 27 | "\n", 28 | "data = {\"id\": [1,2], \"name\":[\"Alex\", \"Tony\"], \"age\": [38, 35]}\n", 29 | "\n", 30 | "df = pl.DataFrame(data)\n", 31 | "\n", 32 | "print(df)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 7, 38 | "id": "0951c1b2-10ed-43df-97de-0f5254606421", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "shape: (1, 1)\n", 46 | "┌─────┐\n", 47 | "│ age │\n", 48 | "│ --- │\n", 49 | "│ i64 │\n", 50 | "╞═════╡\n", 51 | "│ 73 │\n", 52 | "└─────┘\n", 53 | "shape: (1, 1)\n", 54 | "┌─────┐\n", 55 | "│ age │\n", 56 | "│ --- │\n", 57 | "│ i64 │\n", 58 | "╞═════╡\n", 59 | "│ 38 │\n", 60 | "└─────┘\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "total_age = df.select(pl.sum(\"age\"))\n", 66 | "max_age = 
df.select(pl.max(\"age\"))\n", 67 | "\n", 68 | "print(total_age)\n", 69 | "print(max_age)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 8, 75 | "id": "4691af92-d2f3-4cb9-9c8c-dedee660f65f", 76 | "metadata": { 77 | "scrolled": true 78 | }, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "shape: (1, 1)\n", 85 | "┌──────┐\n", 86 | "│ age │\n", 87 | "│ --- │\n", 88 | "│ f64 │\n", 89 | "╞══════╡\n", 90 | "│ 36.5 │\n", 91 | "└──────┘\n", 92 | "shape: (1, 1)\n", 93 | "┌─────┐\n", 94 | "│ age │\n", 95 | "│ --- │\n", 96 | "│ i64 │\n", 97 | "╞═════╡\n", 98 | "│ 35 │\n", 99 | "└─────┘\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "avg_age = df.select(pl.mean(\"age\"))\n", 105 | "min_age = df.select(pl.min(\"age\"))\n", 106 | "\n", 107 | "print(avg_age)\n", 108 | "print(min_age)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 9, 114 | "id": "6aad3d2e-e987-4d63-af6d-f24bb2cd8db7", 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "shape: (1, 3)\n", 122 | "┌─────┬──────┬─────┐\n", 123 | "│ id ┆ name ┆ age │\n", 124 | "│ --- ┆ --- ┆ --- │\n", 125 | "│ i64 ┆ str ┆ i64 │\n", 126 | "╞═════╪══════╪═════╡\n", 127 | "│ 2 ┆ Tony ┆ 35 │\n", 128 | "└─────┴──────┴─────┘\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "thirty_five = df.filter(pl.col(\"age\") == 35)\n", 134 | "print(thirty_five)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "5cc43b17-c5c4-4027-96e0-7683696bab98", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3 (ipykernel)", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | 
"mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.11.5" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 5 167 | } 168 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/movie_fact_table.csv: -------------------------------------------------------------------------------- 1 | id,movie_name,genre_id,director_id 2 | 1,Inception,1,1 3 | 2,The Matrix,2,2 4 | 3,Fight Club,3,3 5 | 4,Pulp Fiction,1,4 6 | 5,Forrest Gump,4,5 7 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/movies.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "50596b28-99c7-4107-9fe4-ff27ac28d5ed", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "shape: (15, 3)\n", 14 | "┌───────────────────────────────────┬─────────────────────────────────┬───────────┐\n", 15 | "│ Title ┆ Director ┆ Genre │\n", 16 | "│ --- ┆ --- ┆ --- │\n", 17 | "│ str ┆ str ┆ str │\n", 18 | "╞═══════════════════════════════════╪═════════════════════════════════╪═══════════╡\n", 19 | "│ The Shawshank Redemption ┆ Frank Darabont ┆ Drama │\n", 20 | "│ The Godfather ┆ Francis Ford Coppola ┆ Crime │\n", 21 | "│ The Dark Knight ┆ Christopher Nolan ┆ Action │\n", 22 | "│ 12 Angry Men ┆ Sidney Lumet ┆ Drama │\n", 23 | "│ … ┆ … ┆ … │\n", 24 | "│ Star Wars: Episode V - The Empir… ┆ Irvin Kershner ┆ Action │\n", 25 | "│ The Lord of the Rings: The Two T… ┆ Peter Jackson ┆ Adventure │\n", 26 | "│ The Matrix ┆ Lana Wachowski, Lilly Wachowski ┆ Action │\n", 27 | "│ Goodfellas ┆ Martin Scorsese ┆ Crime │\n", 28 | "└───────────────────────────────────┴─────────────────────────────────┴───────────┘\n" 29 | ] 30 | } 31 | 
], 32 | "source": [ 33 | "import polars as pl\n", 34 | "\n", 35 | "# Create a DataFrame with movie data\n", 36 | "data = {\n", 37 | " \"Title\": [\n", 38 | " \"The Shawshank Redemption\", \"The Godfather\", \"The Dark Knight\",\n", 39 | " \"12 Angry Men\", \"Schindler's List\", \"The Lord of the Rings: The Return of the King\",\n", 40 | " \"Pulp Fiction\", \"Forrest Gump\", \"Inception\", \"Fight Club\",\n", 41 | " \"The Lord of the Rings: The Fellowship of the Ring\", \"Star Wars: Episode V - The Empire Strikes Back\",\n", 42 | " \"The Lord of the Rings: The Two Towers\", \"The Matrix\", \"Goodfellas\"\n", 43 | " ],\n", 44 | " \"Director\": [\n", 45 | " \"Frank Darabont\", \"Francis Ford Coppola\", \"Christopher Nolan\",\n", 46 | " \"Sidney Lumet\", \"Steven Spielberg\", \"Peter Jackson\",\n", 47 | " \"Quentin Tarantino\", \"Robert Zemeckis\", \"Christopher Nolan\", \"David Fincher\",\n", 48 | " \"Peter Jackson\", \"Irvin Kershner\", \"Peter Jackson\", \"Lana Wachowski, Lilly Wachowski\", \"Martin Scorsese\"\n", 49 | " ],\n", 50 | " \"Genre\": [\n", 51 | " \"Drama\", \"Crime\", \"Action\",\n", 52 | " \"Drama\", \"Biography\", \"Adventure\",\n", 53 | " \"Crime\", \"Drama\", \"Action\", \"Drama\",\n", 54 | " \"Adventure\", \"Action\", \"Adventure\", \"Action\", \"Crime\"\n", 55 | " ]\n", 56 | "}\n", 57 | "\n", 58 | "df = pl.DataFrame(data)\n", 59 | "print(df)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "id": "7b56f2c5-5dea-462d-a527-470016d510e3", 66 | "metadata": { 67 | "scrolled": true 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "Total count of movies: 15\n", 75 | "Count by genre:\n", 76 | "shape: (5, 2)\n", 77 | "┌───────────┬───────┐\n", 78 | "│ Genre ┆ Count │\n", 79 | "│ --- ┆ --- │\n", 80 | "│ str ┆ u32 │\n", 81 | "╞═══════════╪═══════╡\n", 82 | "│ Action ┆ 4 │\n", 83 | "│ Biography ┆ 1 │\n", 84 | "│ Crime ┆ 3 │\n", 85 | "│ Adventure ┆ 3 │\n", 86 | "│ Drama ┆ 4 
│\n", 87 | "└───────────┴───────┘\n" 88 | ] 89 | }, 90 | { 91 | "name": "stderr", 92 | "output_type": "stream", 93 | "text": [ 94 | "/tmp/ipykernel_48364/904532218.py:5: DeprecationWarning: `groupby` is deprecated. It has been renamed to `group_by`.\n", 95 | " count_by_genre = df.groupby(\"Genre\").agg(pl.count(\"Title\").alias(\"Count\"))\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "# Run a count on the entire DataFrame\n", 101 | "total_count = df.height\n", 102 | "\n", 103 | "# Run a count by genre\n", 104 | "count_by_genre = df.groupby(\"Genre\").agg(pl.count(\"Title\").alias(\"Count\"))\n", 105 | "\n", 106 | "print(\"Total count of movies:\", total_count)\n", 107 | "print(\"Count by genre:\")\n", 108 | "print(count_by_genre)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "id": "e71930db-37a1-4eab-815f-4c198bc9989a", 115 | "metadata": { 116 | "scrolled": true 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "DataFrame with Director and Genre for Drama movies:\n", 124 | "shape: (4, 2)\n", 125 | "┌─────────────────┬───────┐\n", 126 | "│ Director ┆ Genre │\n", 127 | "│ --- ┆ --- │\n", 128 | "│ str ┆ str │\n", 129 | "╞═════════════════╪═══════╡\n", 130 | "│ Frank Darabont ┆ Drama │\n", 131 | "│ Sidney Lumet ┆ Drama │\n", 132 | "│ Robert Zemeckis ┆ Drama │\n", 133 | "│ David Fincher ┆ Drama │\n", 134 | "└─────────────────┴───────┘\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "# Create a data frame of just director and genre for drama movies\n", 140 | "drama_movies = df.filter(pl.col(\"Genre\") == \"Drama\").select([\"Director\", \"Genre\"])\n", 141 | "\n", 142 | "print(\"DataFrame with Director and Genre for Drama movies:\")\n", 143 | "print(drama_movies)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "83ee3f65-1264-4feb-8902-10487e9b15cf", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [] 153 
| } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python 3 (ipykernel)", 158 | "language": "python", 159 | "name": "python3" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": "ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.11.5" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 5 176 | } 177 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/read_join_csv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "55c6076a-43fb-4da8-8d67-7eec4e01eba7", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "shape: (5, 4)\n", 14 | "┌─────┬──────────────┬───────────────────┬────────────┐\n", 15 | "│ id ┆ movie_name ┆ director_name ┆ genre_name │\n", 16 | "│ --- ┆ --- ┆ --- ┆ --- │\n", 17 | "│ i64 ┆ str ┆ str ┆ str │\n", 18 | "╞═════╪══════════════╪═══════════════════╪════════════╡\n", 19 | "│ 1 ┆ Inception ┆ Christopher Nolan ┆ Action │\n", 20 | "│ 2 ┆ The Matrix ┆ Lana Wachowski ┆ Sci-Fi │\n", 21 | "│ 3 ┆ Fight Club ┆ David Fincher ┆ Drama │\n", 22 | "│ 4 ┆ Pulp Fiction ┆ Quentin Tarantino ┆ Action │\n", 23 | "│ 5 ┆ Forrest Gump ┆ Robert Zemeckis ┆ Romance │\n", 24 | "└─────┴──────────────┴───────────────────┴────────────┘\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "import polars as pl\n", 30 | "\n", 31 | "# Load the CSV files\n", 32 | "movie_fact_df = pl.read_csv('movie_fact_table.csv')\n", 33 | "director_df = pl.read_csv('director_dimension_table.csv')\n", 34 | "genre_df = pl.read_csv('genre_dimension_table.csv')\n", 35 | "\n", 36 | "# Join the movie_fact_df with the director_df\n", 37 
| "movie_director_df = movie_fact_df.join(director_df, on=\"director_id\", how=\"left\")\n", 38 | "\n", 39 | "# Join the resulting DataFrame with the genre_df\n", 40 | "full_movie_df = movie_director_df.join(genre_df, on=\"genre_id\", how=\"left\")\n", 41 | "\n", 42 | "# Display the joined DataFrame\n", 43 | "print(full_movie_df.select([\"id\", \"movie_name\", \"director_name\", \"genre_name\"]))" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "4a41d35e-afea-46a2-8828-0a4da380ff41", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python 3 (ipykernel)", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.11.5" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 5 76 | } 77 | -------------------------------------------------------------------------------- /intro_to_python/polars-exercises/write_csv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "da2bd6f6-1944-42cc-9fc2-5750bf68a1b0", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Creating sample data for the three CSV files\n", 11 | "import polars as pl\n", 12 | "\n", 13 | "# Movie Fact Table\n", 14 | "movie_fact_data = {\n", 15 | " \"id\": [1, 2, 3, 4, 5],\n", 16 | " \"movie_name\": [\"Inception\", \"The Matrix\", \"Fight Club\", \"Pulp Fiction\", \"Forrest Gump\"],\n", 17 | " \"genre_id\": [1, 2, 3, 1, 4],\n", 18 | " \"director_id\": [1, 2, 3, 4, 5]\n", 19 | "}\n", 20 | "movie_fact_df = pl.DataFrame(movie_fact_data)\n", 21 | 
"movie_fact_df.write_csv('movie_fact_table.csv', separator=\",\", include_header=True)\n", 22 | "\n", 23 | "# Director Dimension Table\n", 24 | "director_data = {\n", 25 | " \"director_id\": [1, 2, 3, 4, 5],\n", 26 | " \"director_name\": [\"Christopher Nolan\", \"Lana Wachowski\", \"David Fincher\", \"Quentin Tarantino\", \"Robert Zemeckis\"],\n", 27 | " \"birth_year\": [1970, 1965, 1962, 1963, 1952]\n", 28 | "}\n", 29 | "director_df = pl.DataFrame(director_data)\n", 30 | "director_df.write_csv('director_dimension_table.csv', separator=\",\", include_header=True)\n", 31 | "\n", 32 | "# Genre Dimension Table\n", 33 | "genre_data = {\n", 34 | " \"genre_id\": [1, 2, 3, 4],\n", 35 | " \"genre_name\": [\"Action\", \"Sci-Fi\", \"Drama\", \"Romance\"]\n", 36 | "}\n", 37 | "genre_df = pl.DataFrame(genre_data)\n", 38 | "genre_df.write_csv('genre_dimension_table.csv', separator=\",\", include_header=True)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "da43fabf-68a0-46b2-90e0-35ef62a1b7af", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": "Python 3 (ipykernel)", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 3 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython3", 66 | "version": "3.11.5" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 5 71 | } 72 | -------------------------------------------------------------------------------- /intro_to_python/readme.md: -------------------------------------------------------------------------------- 1 | ## Create a Virtual Environment and Run Jupyterlab 2 | 3 | - Must have Python and Anaconda Installed 4 | 5 | ### With Anaconda 6 | 7 | **Create Environment** 8 | 9 | initialize conda if isn't 
already 10 | 11 | ```bash 12 | ### For Bash 13 | conda init bash 14 | ### For zsh 15 | conda init zsh 16 | ``` 17 | 18 | create an environment 19 | ``` 20 | conda create -n myenv python=3.11 21 | ``` 22 | 23 | activate the environment 24 | ``` 25 | conda activate myenv 26 | ``` 27 | 28 | To later deactivate an environment 29 | ``` 30 | conda deactivate 31 | ``` 32 | 33 | **Run Notebook** 34 | 35 | install dependencies 36 | ```bash 37 | conda install jupyter jupyterlab 38 | ``` 39 | 40 | run jupyter lab 41 | ```bash 42 | jupyter lab 43 | ``` 44 | 45 | ### With Pip 46 | 47 | create an environment 48 | ```bash 49 | python -m venv venv 50 | ``` 51 | 52 | activate environment 53 | ```bash 54 | source ./venv/bin/activate 55 | ``` 56 | 57 | install jupyter and jupyterlab 58 | ```bash 59 | pip install jupyter jupyterlab 60 | ``` 61 | 62 | run jupyterlab 63 | ``` 64 | jupyter lab 65 | ``` 66 | 67 | ## Installing Polars and Seaborn 68 | 69 | - turn off jupyter lab by hitting `ctrl + c` in your terminal 70 | 71 | - add conda forge to your conda install 72 | 73 | ```bash 74 | conda config --add channels conda-forge 75 | ``` 76 | 77 | - install libraries 78 | 79 | ```bash 80 | conda install polars seaborn 81 | ``` 82 | 83 | #### If using pip 84 | 85 | Just install 86 | 87 | ``` 88 | pip install seaborn polars 89 | ``` -------------------------------------------------------------------------------- /intro_to_python/scratch.py: -------------------------------------------------------------------------------- 1 | print("Hello World") -------------------------------------------------------------------------------- /intro_to_sql/scripts.md: -------------------------------------------------------------------------------- 1 | # Intro to SQL Code 2 | 3 | ### WHERE 4 | 5 | ```sql 6 | -- Create a table named 'students' with at least 5 columns 7 | CREATE TABLE students ( 8 | student_id INTEGER PRIMARY KEY, 9 | first_name TEXT, 10 | last_name TEXT, 11 | age INTEGER, 12 | gender TEXT, 13 |
grade FLOAT 14 | ); 15 | 16 | -- Insert 5 records into the 'students' table 17 | INSERT INTO students (first_name, last_name, age, gender, grade) 18 | VALUES 19 | ('John', 'Doe', 18, 'Male', 90.5), 20 | ('Jane', 'Smith', 20, 'Female', 85.0), 21 | ('Alice', 'Johnson', 19, 'Female', 92.5), 22 | ('Bob', 'Williams', 21, 'Male', 78.5), 23 | ('Eva', 'Brown', 22, 'Female', 88.0); 24 | 25 | -- Example 1: Using WHERE with greater than 26 | SELECT * FROM students WHERE age > 20; 27 | 28 | -- Example 2: Using WHERE with BETWEEN 29 | SELECT * FROM students WHERE grade BETWEEN 85.0 AND 90.0; 30 | 31 | -- Example 3: Using WHERE with multiple conditions (AND) 32 | SELECT * FROM students WHERE age >= 18 AND gender = 'Female'; 33 | 34 | -- Example 4: Using WHERE with OR 35 | SELECT * FROM students WHERE age < 20 OR grade >= 90.0; 36 | ``` 37 | 38 | ### Aggregations 39 | 40 | ```sql 41 | -- Create a table named 'sales' with relevant columns 42 | CREATE TABLE sales ( 43 | id INTEGER PRIMARY KEY, 44 | salesperson TEXT, 45 | region TEXT, 46 | amount INTEGER, 47 | sale_date DATE 48 | ); 49 | 50 | -- Insert 5 records into the 'sales' table 51 | INSERT INTO sales (salesperson, region, amount, sale_date) 52 | VALUES 53 | ('Alice', 'North', 500, '2023-01-10'), 54 | ('Bob', 'South', 700, '2023-02-15'), 55 | ('Alice', 'East', 600, '2023-03-20'), 56 | ('Charlie', 'West', 300, '2023-04-25'), 57 | ('Alice', 'North', 800, '2023-05-30'); 58 | 59 | -- Aggregate Query Example 1: COUNT 60 | -- Count the total number of sales records 61 | SELECT COUNT(*) FROM sales; 62 | 63 | -- Aggregate Query Example 2: SUM 64 | -- Calculate the total sales amount 65 | SELECT SUM(amount) FROM sales; 66 | 67 | -- Aggregate Query Example 3: AVG (Average) 68 | -- Calculate the average sales amount 69 | SELECT AVG(amount) FROM sales; 70 | 71 | -- Aggregate Query Example 4: MAX 72 | -- Find the maximum sales amount 73 | SELECT MAX(amount) FROM sales; 74 | 75 | -- Aggregate Query Example 5: MIN 76 | -- Find the minimum 
sales amount 77 | SELECT MIN(amount) FROM sales; 78 | 79 | -- Aggregate Query Example 6: GROUP BY with COUNT 80 | -- Count the number of sales per salesperson 81 | SELECT salesperson, COUNT(*) AS num_sales FROM sales GROUP BY salesperson; 82 | 83 | -- Aggregate Query Example 7: GROUP BY with SUM 84 | -- Calculate the total sales amount per region 85 | SELECT region, SUM(amount) AS total_sales FROM sales GROUP BY region; 86 | 87 | -- Aggregate Query Example 8: GROUP BY with AVG 88 | -- Calculate the average sales amount per salesperson 89 | SELECT salesperson, AVG(amount) AS average_sales FROM sales GROUP BY salesperson; 90 | 91 | -- Aggregate Query Example 9: HAVING 92 | -- Find regions with a total sales amount greater than a certain value 93 | SELECT region, SUM(amount) AS total_sales FROM sales GROUP BY region HAVING SUM(amount) > 1000; 94 | ``` 95 | 96 | ### JOINS 97 | 98 | ```sql 99 | -- Create a table named 'employees' 100 | CREATE TABLE employees ( 101 | employee_id INTEGER PRIMARY KEY, 102 | employee_name TEXT, 103 | department_id INTEGER 104 | ); 105 | 106 | -- Create a table named 'departments' 107 | CREATE TABLE departments ( 108 | department_id INTEGER PRIMARY KEY, 109 | department_name TEXT 110 | ); 111 | 112 | -- Insert data into 'employees' 113 | INSERT INTO employees (employee_name, department_id) VALUES 114 | ('John Doe', 1), 115 | ('Jane Smith', 2), 116 | ('Alice Johnson', 3), 117 | ('Bob Williams', NULL), -- Bob's department is unknown 118 | ('Eva Brown', 2); 119 | 120 | -- Insert data into 'departments' 121 | INSERT INTO departments (department_id, department_name) VALUES 122 | (1, 'Human Resources'), 123 | (2, 'Marketing'), 124 | (3, 'Finance'), 125 | (4, 'IT'); 126 | 127 | -- Join Example 1: INNER JOIN 128 | -- Get the list of employees with their department names 129 | SELECT employees.employee_name, departments.department_name 130 | FROM employees 131 | INNER JOIN departments ON employees.department_id = departments.department_id; 132 | 133 
| -- Join Example 2: LEFT JOIN (or LEFT OUTER JOIN) 134 | -- Get all employees and their department names, including those without a department 135 | SELECT employees.employee_name, departments.department_name 136 | FROM employees 137 | LEFT JOIN departments ON employees.department_id = departments.department_id; 138 | 139 | -- Join Example 3: RIGHT JOIN (or RIGHT OUTER JOIN) - Not supported in SQLite before version 3.39.0 140 | -- This would get all departments and their employees, including departments without employees 141 | -- (On older SQLite versions Right Joins are not supported, but the same result can be achieved with a LEFT JOIN and switching the tables) 142 | 143 | -- Join Example 4: FULL OUTER JOIN - Not supported in SQLite before version 3.39.0 144 | -- This would get all employees and all departments, regardless of whether they match 145 | -- (On older SQLite versions Full Outer Joins are not supported, but can be simulated with a UNION of two LEFT JOINs, the second with the tables switched) 146 | 147 | -- Join Example 5: CROSS JOIN 148 | -- Get a Cartesian product of employees and departments 149 | SELECT employees.employee_name, departments.department_name 150 | FROM employees 151 | CROSS JOIN departments; 152 | 153 | -- Join Example 6: SELF JOIN 154 | -- Join employees table to itself to find colleagues (employees in the same department) 155 | SELECT A.employee_name AS Employee1, B.employee_name AS Employee2 156 | FROM employees A, employees B 157 | WHERE A.department_id = B.department_id AND A.employee_id != B.employee_id; 158 | ``` 159 | 160 | ### INSERT INTO SELECT & CTAS 161 | 162 | ```sql 163 | -- Create an example table named 'products' 164 | CREATE TABLE products ( 165 | product_id INTEGER PRIMARY KEY, 166 | product_name TEXT, 167 | price DECIMAL(10, 2), 168 | category TEXT 169 | ); 170 | 171 | -- Insert data into 'products' 172 | INSERT INTO products (product_name, price, category) VALUES 173 | ('Laptop', 1200.00, 'Electronics'), 174 | ('Smartphone', 800.00, 'Electronics'), 175 | ('Desk Chair', 150.00, 'Furniture'), 176 | ('Table Lamp', 45.00,
'Furniture'), 177 | ('Bluetooth Headphones', 130.00, 'Electronics'); 178 | 179 | -- Create a new table 'electronics' with products from the 'Electronics' category 180 | CREATE TABLE electronics AS 181 | SELECT * 182 | FROM products 183 | WHERE category = 'Electronics'; 184 | 185 | -- Now, insert additional 'Electronics' products into the 'electronics' table 186 | -- by selecting from 'products' 187 | INSERT INTO electronics (product_id, product_name, price, category) 188 | SELECT product_id, product_name, price, category 189 | FROM products 190 | WHERE category = 'Electronics' AND price > 500; 191 | 192 | -- Select from the new 'electronics' table to verify 193 | SELECT * FROM electronics; 194 | ``` 195 | 196 | ### Common Table Expressions 197 | 198 | ```sql 199 | -- Create a table named 'employees' 200 | CREATE TABLE employees ( 201 | id INTEGER PRIMARY KEY, 202 | name TEXT, 203 | manager_id INTEGER 204 | ); 205 | 206 | -- Insert data into 'employees' 207 | INSERT INTO employees (id, name, manager_id) VALUES 208 | (1, 'Alice', NULL), -- Alice is the CEO 209 | (2, 'Bob', 1), -- Bob reports to Alice 210 | (3, 'Charlie', 1), -- Charlie also reports to Alice 211 | (4, 'David', 2), -- David reports to Bob 212 | (5, 'Eva', 3); -- Eva reports to Charlie 213 | 214 | -- Using a Common Table Expression (CTE) to find direct reports 215 | WITH DirectReports AS ( 216 | SELECT 217 | e1.name AS Employee, 218 | e2.name AS Manager 219 | FROM 220 | employees e1 221 | INNER JOIN 222 | employees e2 ON e1.manager_id = e2.id 223 | ) 224 | SELECT 225 | * 226 | FROM 227 | DirectReports; 228 | 229 | -- Another CTE example: Aggregating data 230 | -- Counting the number of direct reports each manager has 231 | WITH ReportCounts AS ( 232 | SELECT 233 | manager_id, 234 | COUNT(*) AS NumberOfReports 235 | FROM 236 | employees 237 | WHERE 238 | manager_id IS NOT NULL 239 | GROUP BY 240 | manager_id 241 | ) 242 | SELECT 243 | e.name AS Manager, 244 | rc.NumberOfReports 245 | FROM 246 | employees 
e 247 | INNER JOIN 248 | ReportCounts rc ON e.id = rc.manager_id; 249 | ``` 250 | 251 | ### Window Functions 252 | 253 | ```sql 254 | -- Create a table named 'sales' 255 | CREATE TABLE sales ( 256 | sale_id INTEGER PRIMARY KEY, 257 | salesperson TEXT, 258 | region TEXT, 259 | amount INTEGER, 260 | sale_date DATE 261 | ); 262 | 263 | -- Insert data into 'sales' 264 | INSERT INTO sales (salesperson, region, amount, sale_date) VALUES 265 | ('Alice', 'North', 300, '2023-01-10'), 266 | ('Bob', 'South', 500, '2023-01-15'), 267 | ('Alice', 'East', 450, '2023-01-20'), 268 | ('Charlie', 'North', 700, '2023-02-10'), 269 | ('Alice', 'South', 600, '2023-02-20'), 270 | ('Bob', 'East', 350, '2023-03-15'), 271 | ('Charlie', 'South', 500, '2023-04-10'), 272 | ('Alice', 'North', 400, '2023-05-15'); 273 | 274 | -- Window Function Example 1: ROW_NUMBER() 275 | -- Assigns a unique number to each row within the partition of a result set 276 | SELECT salesperson, region, amount, 277 | ROW_NUMBER() OVER (PARTITION BY region ORDER BY amount DESC) AS row_num 278 | FROM sales; 279 | 280 | -- Window Function Example 2: RANK() 281 | -- Assigns a rank to each row within a partition of a result set, with gaps in rank values 282 | SELECT salesperson, region, amount, 283 | RANK() OVER (PARTITION BY region ORDER BY amount DESC) AS rank 284 | FROM sales; 285 | 286 | -- Window Function Example 3: DENSE_RANK() 287 | -- Similar to RANK(), but without gaps in the rank values 288 | SELECT salesperson, region, amount, 289 | DENSE_RANK() OVER (PARTITION BY region ORDER BY amount DESC) AS dense_rank 290 | FROM sales; 291 | 292 | -- Window Function Example 4: SUM() as a Window Function 293 | -- Provides a running total within a partition 294 | SELECT salesperson, region, amount, 295 | SUM(amount) OVER (PARTITION BY region ORDER BY sale_date) AS running_total 296 | FROM sales; 297 | 298 | -- Window Function Example 5: AVG() as a Window Function 299 | -- Calculates the average within a partition 300 | SELECT 
salesperson, region, amount, 301 | AVG(amount) OVER (PARTITION BY region ORDER BY sale_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS avg_to_date 302 | FROM sales; 303 | 304 | -- Window Function Example 6: LEAD() and LAG() 305 | -- LEAD() provides access to a row at a given physical offset that follows the current row 306 | -- LAG() provides access to a row at a given physical offset that precedes the current row 307 | SELECT salesperson, region, amount, 308 | LAG(amount, 1) OVER (ORDER BY sale_date) AS prev_amount, 309 | LEAD(amount, 1) OVER (ORDER BY sale_date) AS next_amount 310 | FROM sales; 311 | ``` -------------------------------------------------------------------------------- /parquet/weather_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexMercedCoder/understanding_data_with_alex_merced/981d77b2eebffcc33ef5aa60c4512bb311bb4dfe/parquet/weather_data.parquet -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Understanding the Data World 2 | ## with Alex Merced 3 | 4 | #### Social 5 | - [Data Twitter](https://www.twitter.com/amdatalakehouse) 6 | - [Data Youtube](https://www.youtube.com/@alexmerceddata) 7 | - [Web Dev Twitter](https://www.twitter.com/alexmercedcoder) 8 | - [Web Dev Youtube](https://www.youtube.com/@alexmercedcoder) 9 | 10 | 11 | #### Playlists 12 | - [Intro to Data](https://www.youtube.com/playlist?list=PLsLAVBjQJO0p_4Nqz99tIjeoDYE97L0xY) 13 | - [Intro to SQL](https://www.youtube.com/playlist?list=PLsLAVBjQJO0r77mewO5mvglfy7FU6bJSM) 14 | - [Using SQL with Dremio & Apache Iceberg](https://www.youtube.com/playlist?list=PLsLAVBjQJO0p16lqzdehBewJvf3_c8XCK) 15 | - [Basics of Data Modeling](https://www.youtube.com/playlist?list=PLsLAVBjQJO0rAajAnmMUjwDGG0AXGSPp0) 16 | - [Data 
Concepts](https://www.youtube.com/playlist?list=PLsLAVBjQJO0roqGaqT9Mfnsz_nRuZnn4N) 17 | - [Apache Spark 101](https://www.youtube.com/playlist?list=PLsLAVBjQJO0rELsg9N8Agoq9Jg8wUUV_G) 18 | - [Apache Iceberg Lakehouse Engineering](https://www.youtube.com/playlist?list=PLsLAVBjQJO0rELsg9N8Agoq9Jg8wUUV_G) 19 | - [Understanding Data Optimization](https://www.youtube.com/playlist?list=PLsLAVBjQJO0rdNwoyFOTDzGhDcENESgg6) 20 | - [Intro to Python for Data People](https://www.youtube.com/playlist?list=PLsLAVBjQJO0pXBqhlD59MYtxl0E4XRj2J) 21 | - [Computer Science](https://www.youtube.com/playlist?list=PLY6oTPmKnKbbfwjU1ToZlUWCinxmFqlIp) 22 | 23 | #### Slack Community Invites 24 | - [Data Lakehouse Hub](https://join.slack.com/t/thedatalakehousehub/shared_invite/zt-274yc8sza-mI2zhCW8LGkOh1uxuf8T5Q) 25 | - [DataNation Community](https://join.slack.com/t/datanationcom-gti9492/shared_invite/zt-12xrk4qmd-y~6jUFFd7kdaLhgLURKwoA) 26 | - [DevNursery - Web Dev Community](https://join.slack.com/t/amwebdev/shared_invite/zt-9xlfgp6e-bGIv1zCc1x4Pl1Irm5jhrA) 27 | 28 | #### Other Links 29 | - [Substack of data related Articles](https://amdatalakehouse.substack.com/) 30 | - [REPL for Intro to SQL](https://replit.com/@AlexMercedCoder/SQL2024) 31 | - [Spreadsheet from Data Modeling Lessons](https://docs.google.com/spreadsheets/d/1bV3ORdo-CVsI-ZxPzOKvfPb4_ZB6nTfCOurZPPW8ILw/edit?usp=sharing) 32 | - [Spreadsheet for Data Optimization Lessons](https://docs.google.com/spreadsheets/d/1uMbpuyEr00ymbIIwgZxvF5T5sDHEO3LAUxG1ko1nIOo/edit?usp=sharing) 33 | - [Quick Guides from Dremio Repo](https://github.com/developer-advocacy-dremio/quick-guides-from-dremio) 34 | - [Data Lakehouse on your laptop blog](https://dev.to/alexmercedcoder/data-engineering-create-a-apache-iceberg-based-data-lakehouse-on-your-laptop-41a8) -------------------------------------------------------------------------------- /spark_scripts/file_system_table.md: 
-------------------------------------------------------------------------------- 1 | ## Spark/Iceberg Writing Tables to Local Filesystem 2 | 3 | ```py 4 | import pyspark 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql import Row 7 | 8 | conf = ( 9 | pyspark.SparkConf() 10 | .setAppName('iceberg_with_file_system') 11 | #packages 12 | .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.4.2') 13 | #SQL Extensions 14 | .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions') 15 | #Configuring Catalog 16 | .set('spark.sql.catalog.iceberg', 'org.apache.iceberg.spark.SparkCatalog') 17 | .set('spark.sql.catalog.iceberg.type', 'hadoop') 18 | .set('spark.sql.catalog.iceberg.warehouse', 'iceberg-warehouse') 19 | ) 20 | 21 | ## Start Spark Session 22 | spark = SparkSession.builder.config(conf=conf).getOrCreate() 23 | print("Spark Running") 24 | 25 | ## Run a Query 26 | spark.sql("CREATE TABLE IF NOT EXISTS iceberg.names (name STRING) USING iceberg;").show() 27 | 28 | ## Insert a record with SQL 29 | spark.sql("INSERT INTO iceberg.names VALUES ('Alex Merced')") 30 | 31 | ## Insert a record with dataframe api 32 | df = spark.createDataFrame([Row(name="Tony Merced")]) 33 | df.writeTo("iceberg.names").append() 34 | 35 | ## Querying the table 36 | spark.sql("SELECT * FROM iceberg.names;").show() 37 | ``` 38 | 39 | The provided Python script is for setting up and using Apache Spark with Iceberg, an open-source table format for huge analytic datasets. 40 | 41 | ```python 42 | import pyspark 43 | from pyspark.sql import SparkSession 44 | from pyspark.sql import Row 45 | ``` 46 | 47 | The script starts by importing necessary modules: 48 | 49 | - pyspark: The main PySpark package. 50 | - SparkSession: Used for initializing the main entry point for DataFrame and SQL functionality. 51 | - Row: Used to build the single-row DataFrame that is appended to the table through the DataFrame API.
52 | 53 | ```python 54 | conf = ( 55 | pyspark.SparkConf() 56 | .setAppName('iceberg_with_file_system') 57 | #packages 58 | .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.4.2') 59 | #SQL Extensions 60 | .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions') 61 | #Configuring Catalog 62 | .set('spark.sql.catalog.iceberg', 'org.apache.iceberg.spark.SparkCatalog') 63 | .set('spark.sql.catalog.iceberg.type', 'hadoop') 64 | .set('spark.sql.catalog.iceberg.warehouse', 'iceberg-warehouse') 65 | ) 66 | ``` 67 | 68 | This section creates a Spark configuration object (SparkConf): 69 | 70 | - `.setAppName('iceberg_with_file_system')`: Sets the name of the application. 71 | - `.set('spark.jars.packages', ...)`: Specifies the Iceberg package for Spark runtime. 72 | - `.set('spark.sql.extensions', ...)`: Sets SQL extensions for Iceberg integration. 73 | - `.set('spark.sql.catalog.iceberg', ...)`: Configures the Spark catalog to use Iceberg. 74 | - `.set('spark.sql.catalog.iceberg.type', 'hadoop')`: Specifies that Iceberg uses the Hadoop catalog type. 75 | - `.set('spark.sql.catalog.iceberg.warehouse', 'iceberg-warehouse')`: Sets the location of the Iceberg warehouse. 76 | 77 | ```python 78 | 79 | ## Start Spark Session 80 | spark = SparkSession.builder.config(conf=conf).getOrCreate() 81 | print("Spark Running") 82 | ``` 83 | 84 | - Initializes a SparkSession with the above configuration. If a session already exists, it retrieves that session. 85 | 86 | - Prints "Spark Running" to confirm the Spark session is active. 87 | 88 | ```python 89 | ## Run a Query 90 | spark.sql("CREATE TABLE IF NOT EXISTS iceberg.names (name STRING) USING iceberg;").show() 91 | ``` 92 | 93 | - Executes a SQL query using Spark's SQL capabilities. 94 | 95 | - The query creates a table named `iceberg.names` (unless it already exists) with a single column `name` of type STRING, using the Iceberg format. 96 | 97 | - `.show()` displays the results of the query (if any).
In this case, it might show the status of the table creation. 98 | -------------------------------------------------------------------------------- /spark_scripts/nessie_ingest.md: -------------------------------------------------------------------------------- 1 | ```py 2 | import pyspark 3 | from pyspark.sql import SparkSession 4 | import os 5 | 6 | 7 | ## DEFINE SENSITIVE VARIABLES 8 | NESSIE_URI = "http://172.17.0.4:19120/api/v1" 9 | WAREHOUSE = "s3a://warehouse/" 10 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID") 11 | AWS_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 12 | AWS_S3_ENDPOINT= "http://172.17.0.3:9000" 13 | 14 | 15 | 16 | 17 | 18 | conf = ( 19 | pyspark.SparkConf() 20 | .setAppName('app_name') 21 | .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.4.2,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.4_2.12:0.75.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178') 22 | .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions') 23 | .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog') 24 | .set('spark.sql.catalog.nessie.uri', NESSIE_URI) 25 | .set('spark.sql.catalog.nessie.ref', 'main') 26 | .set('spark.sql.catalog.nessie.authentication.type', 'NONE') 27 | .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog') 28 | .set('spark.sql.catalog.nessie.s3.endpoint', AWS_S3_ENDPOINT) 29 | .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE) 30 | .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO') 31 | ) 32 | 33 | 34 | ## Start Spark Session 35 | spark = SparkSession.builder.config(conf=conf).getOrCreate() 36 | print("Spark Running") 37 | 38 | # Read the file 39 | parquet_file_path = "../sampledata/Worker_Coops.csv" 40 | df = spark.read.format("csv").option("header", 
"true").load(parquet_file_path) 41 | 42 | # Create a temporary view using the DataFrame 43 | df.createOrReplaceTempView("wv_view") 44 | 45 | ## Create a Table with matching schema but no records 46 | spark.sql("CREATE TABLE IF NOT EXISTS nessie.worker_coop AS (SELECT * FROM wv_view LIMIT 0)").show() 47 | 48 | ## Create a Branch 49 | spark.sql("CREATE BRANCH IF NOT EXISTS dev IN nessie") 50 | 51 | ## Use Branch 52 | spark.sql("USE REFERENCE dev IN nessie") 53 | 54 | ## Insert Some Data 55 | spark.sql("INSERT INTO nessie.worker_coop SELECT * FROM wv_view").show() 56 | 57 | ## Query the Data 58 | spark.sql("SELECT * FROM nessie.worker_coop").show() 59 | 60 | ## Use Branch 61 | spark.sql("USE REFERENCE main IN nessie") 62 | 63 | ## Query the Data 64 | spark.sql("SELECT * FROM nessie.worker_coop").show() 65 | 66 | ## Merge the Data 67 | spark.sql("MERGE BRANCH dev INTO main IN nessie") 68 | 69 | ######### 70 | 71 | ## Query the Data 72 | spark.sql("SELECT * FROM nessie.worker_coop").show() 73 | ``` -------------------------------------------------------------------------------- /spark_scripts/nessie_setup.md: -------------------------------------------------------------------------------- 1 | The Following Environmental Variables are needed on your spark container: 2 | 3 | ```bash 4 | # AWS_REGION is used by Spark 5 | AWS_REGION=us-east-1 6 | # Used by pyIceberg 7 | AWS_DEFAULT_REGION=us-east-1 8 | # AWS Credentials (this can use minio credential, to be filled in later) 9 | AWS_ACCESS_KEY_ID=XXXXXXXXXXXXXXX 10 | AWS_SECRET_ACCESS_KEY=xxxxxxx 11 | ``` 12 | 13 | AND THE FOLLOWING ENVIRONMENT VARIABLES IN THE MINIO CONTAINER 14 | 15 | ```bash 16 | MINIO_ROOT_USER=admin 17 | MINIO_ROOT_PASSWORD=password 18 | MINIO_REGION=us-east-1 19 | MINIO_REGION_NAME=us-east-1 20 | ``` 21 | 22 | ```py 23 | import pyspark 24 | from pyspark.sql import SparkSession 25 | import os 26 | 27 | 28 | ## DEFINE SENSITIVE VARIABLES 29 | NESSIE_URI = "http://172.17.0.4:19120/api/v1" 30 | WAREHOUSE = 
"s3a://warehouse/" 31 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID") 32 | AWS_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 33 | AWS_S3_ENDPOINT= "http://172.17.0.3:9000" 34 | 35 | 36 | 37 | 38 | 39 | conf = ( 40 | pyspark.SparkConf() 41 | .setAppName('app_name') 42 | .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.4.2,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.4_2.12:0.75.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178') 43 | .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions') 44 | .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog') 45 | .set('spark.sql.catalog.nessie.uri', NESSIE_URI) 46 | .set('spark.sql.catalog.nessie.ref', 'main') 47 | .set('spark.sql.catalog.nessie.authentication.type', 'NONE') 48 | .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog') 49 | .set('spark.sql.catalog.nessie.s3.endpoint', AWS_S3_ENDPOINT) 50 | .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE) 51 | .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO') 52 | ) 53 | 54 | 55 | ## Start Spark Session 56 | spark = SparkSession.builder.config(conf=conf).getOrCreate() 57 | print("Spark Running") 58 | 59 | 60 | ## Create a Table 61 | spark.sql("CREATE TABLE nessie.testnames (name STRING) USING iceberg;").show() 62 | 63 | 64 | ## Insert Some Data 65 | spark.sql("INSERT INTO nessie.testnames VALUES ('Alex Merced'), ('Tomer Shiran'), ('Jason Hughes')").show() 66 | 67 | 68 | ## Query the Data 69 | spark.sql("SELECT * FROM nessie.testnames;").show() 70 | ``` -------------------------------------------------------------------------------- /spark_scripts/pyspark_basics.md: -------------------------------------------------------------------------------- 1 | ```py 2 | from pyspark.sql import 
SparkSession 3 | from pyspark.sql import Row 4 | 5 | # Initialize a SparkSession 6 | spark = SparkSession.builder \ 7 | .appName("PySpark SQL and DataFrame API Example") \ 8 | .getOrCreate() 9 | 10 | # Create a simple DataFrame 11 | data = [Row(name="Alice", age=25), 12 | Row(name="Bob", age=30), 13 | Row(name="Charlie", age=35)] 14 | 15 | df = spark.createDataFrame(data) 16 | 17 | # Show the DataFrame 18 | print("DataFrame:") 19 | df.show() 20 | 21 | # Writing DataFrame to a temporary view 22 | df.createOrReplaceTempView("people") 23 | 24 | # Using SQL API to query the DataFrame 25 | print("SQL API output:") 26 | spark.sql("SELECT * FROM people WHERE age > 28").show() 27 | 28 | # Alternatively, using DataFrame API to perform the same query 29 | print("DataFrame API output:") 30 | df.filter(df.age > 28).show() 31 | 32 | # Stop the SparkSession 33 | spark.stop() 34 | ``` 35 | 36 | ## Explanation 37 | **SparkSession Initialization:** Creates an instance of SparkSession, which is the entry point for Spark functionality. 38 | 39 | **Creating a DataFrame:** A simple DataFrame is created from a list of Row objects. This DataFrame contains two columns, name and age. 40 | 41 | **DataFrame Operations:** The DataFrame is shown using show() method. 42 | 43 | **Writing DataFrame to Temp View:** The DataFrame is written to a temporary view (in-memory table) named people, allowing it to be queried using SQL syntax. 44 | 45 | **Reading with SQL API:** The script uses spark.sql() to run an SQL query on the temporary view, selecting people older than 28. 46 | 47 | **Reading with DataFrame API:** The script also demonstrates achieving the same result using the DataFrame API, using the filter() method. 48 | 49 | **Stopping SparkSession:** Finally, the script stops the SparkSession, releasing the resources. 50 | 51 | This script serves as a basic introduction to using both SQL and DataFrame APIs in PySpark for simple data processing tasks. 
Make sure you have a Spark environment set up to run this script. -------------------------------------------------------------------------------- /sql_using_dremio/CASE.md: -------------------------------------------------------------------------------- 1 | ```sql 2 | -- Create a table named 'destinations' 3 | CREATE TABLE destinations ( 4 | id INTEGER, 5 | name VARCHAR, 6 | category VARCHAR 7 | ); 8 | 9 | -- Insert data into 'destinations' 10 | INSERT INTO destinations (id, name, category) VALUES 11 | (1, 'Eiffel Tower', 'Attraction'), 12 | (2, 'The Ritz', 'Hotels'), 13 | (3, 'Joes Diner', 'Restaurants'), 14 | (4, 'Central Park', 'Parks'), 15 | (5, 'Grand Hotel', 'Hotels'); 16 | 17 | -- Run a query using the CASE statement 18 | -- This will categorize destinations based on their type 19 | SELECT 20 | name, 21 | CASE category 22 | WHEN 'Restaurants' THEN 'food' 23 | WHEN 'Hotels' THEN 'travel' 24 | WHEN 'Attraction' THEN 'sightseeing' 25 | WHEN 'Parks' THEN 'recreation' 26 | ELSE 'other' 27 | END AS category_type 28 | FROM 29 | destinations; 30 | ``` 31 | 32 | This code block first creates a destinations table with columns id, name, and category. Then, it populates the table with various destinations, each belonging to a category like 'Restaurants', 'Hotels', etc. Finally, it demonstrates the use of the CASE statement to categorize these destinations into broader types such as 'food', 'travel', 'sightseeing', and 'recreation'. This example shows how CASE can be used to transform and categorize data based on specific conditions. 
-------------------------------------------------------------------------------- /sql_using_dremio/MEDIAN.md: -------------------------------------------------------------------------------- 1 | ```sql 2 | -- Step 1: Create a table and populate it with sample data 3 | CREATE TABLE sample_data ( 4 | id INTEGER, 5 | val INTEGER 6 | ); 7 | 8 | -- Insert sample data into the table 9 | INSERT INTO sample_data (id, val) VALUES 10 | (1, 1), 11 | (2, 20), 12 | (3, 30), 13 | (4, 40), 14 | (5, 200); 15 | 16 | -- Step 2: Use the MEDIAN function to compute the median 17 | SELECT MEDIAN(val) AS median_value, AVG(val) as avg_value FROM sample_data; 18 | ``` -------------------------------------------------------------------------------- /sql_using_dremio/MERGE_INTO.md: -------------------------------------------------------------------------------- 1 | ```sql 2 | -- Step 1: Create and load the 'products' table 3 | CREATE TABLE products ( 4 | product_id INTEGER, 5 | name VARCHAR, 6 | price DECIMAL, 7 | category VARCHAR 8 | ); 9 | 10 | INSERT INTO products (product_id, name, price, category) VALUES 11 | (1, 'Widget A', 19.99, 'Widgets'), 12 | (2, 'Widget B', 29.99, 'Widgets'), 13 | (3, 'Gadget A', 14.99, 'Gadgets'); 14 | 15 | -- Step 2: Create and load the 'product_updates' table 16 | CREATE TABLE product_updates ( 17 | product_id INTEGER, 18 | name VARCHAR, 19 | price DECIMAL, 20 | category VARCHAR 21 | ); 22 | 23 | INSERT INTO product_updates (product_id, name, price, category) VALUES 24 | (2, 'Widget B Plus', 34.99, 'Widgets'), -- Updated product 25 | (3, 'Gadget A', 14.99, 'Gadgets'), -- Unchanged product 26 | (4, 'Gadget B', 24.99, 'Gadgets'); -- New product 27 | 28 | -- Step 3: Run the MERGE statement 29 | MERGE INTO products AS p 30 | USING product_updates AS u 31 | ON (p.product_id = u.product_id) 32 | WHEN MATCHED THEN 33 | UPDATE SET name = u.name, price = u.price, category = u.category 34 | WHEN NOT MATCHED THEN 35 | INSERT (product_id, name, price, category) VALUES 
(u.product_id, u.name, u.price, u.category); 36 | ``` 37 | 38 | Two tables are created: products and product_updates. 39 | products is populated with initial product data. 40 | product_updates is populated with a mix of updated, unchanged, and new product data. 41 | The MERGE INTO statement then checks for matches based on product_id. 42 | If a match is found (indicating an update), it updates the corresponding record in products with the data from product_updates. 43 | If no match is found (indicating a new product), it inserts a new record into products with the data from product_updates. 44 | -------------------------------------------------------------------------------- /sql_using_dremio/STUDENTS.md: -------------------------------------------------------------------------------- 1 | ```sql 2 | -- Step 1: Create a table for students 3 | CREATE TABLE student_records ( 4 | name VARCHAR(50), 5 | age INT, 6 | current_score DECIMAL(4, 2), 7 | favorite_class VARCHAR(50), 8 | homeroom_teacher VARCHAR(50) 9 | ); 10 | 11 | -- Step 2: Insert 49 sample student records 12 | INSERT INTO student_records (name, age, current_score, favorite_class, homeroom_teacher) 13 | VALUES 14 | ('Alice', 18, 95.75, 'Math', 'Mr. Johnson'), 15 | ('Bob', 17, 88.50, 'Science', 'Mrs. Smith'), 16 | ('Charlie', 16, 92.25, 'History', 'Mr. Davis'), 17 | ('David', 17, 89.75, 'English', 'Ms. Wilson'), 18 | ('Emily', 18, 96.00, 'Math', 'Mr. Johnson'), 19 | ('Frank', 16, 91.25, 'Science', 'Mrs. Smith'), 20 | ('Grace', 17, 87.75, 'History', 'Mr. Davis'), 21 | ('Hannah', 18, 94.50, 'English', 'Ms. Wilson'), 22 | ('Isaac', 16, 90.25, 'Math', 'Mr. Johnson'), 23 | ('Jacob', 17, 88.00, 'Science', 'Mrs. Smith'), 24 | ('Katherine', 18, 93.00, 'History', 'Mr. Davis'), 25 | ('Liam', 16, 91.75, 'English', 'Ms. Wilson'), 26 | ('Mia', 17, 89.50, 'Math', 'Mr. Johnson'), 27 | ('Noah', 18, 95.25, 'Science', 'Mrs. Smith'), 28 | ('Olivia', 16, 92.00, 'History', 'Mr. Davis'), 29 | ('Sophia', 17, 87.25, 'English', 'Ms. 
Wilson'), 30 | ('William', 18, 94.00, 'Math', 'Mr. Johnson'), 31 | ('Student50', 17, 88.75, 'History', 'Mr. Davis'), 32 | ('Natalie', 16, 93.25, 'Math', 'Mr. Johnson'), 33 | ('Oliver', 17, 89.00, 'Science', 'Mrs. Smith'), 34 | ('Penelope', 18, 95.50, 'History', 'Mr. Davis'), 35 | ('Quinn', 16, 90.00, 'English', 'Ms. Wilson'), 36 | ('Ryan', 17, 87.00, 'Math', 'Mr. Johnson'), 37 | ('Samantha', 18, 94.75, 'Science', 'Mrs. Smith'), 38 | ('Thomas', 16, 91.00, 'History', 'Mr. Davis'), 39 | ('Uma', 17, 88.25, 'English', 'Ms. Wilson'), 40 | ('Victor', 18, 96.25, 'Math', 'Mr. Johnson'), 41 | ('Willa', 16, 92.50, 'Science', 'Mrs. Smith'), 42 | ('Xander', 17, 86.50, 'History', 'Mr. Davis'), 43 | ('Yara', 18, 94.00, 'English', 'Ms. Wilson'), 44 | ('Zachary', 16, 91.75, 'Math', 'Mr. Johnson'), 45 | ('Ava', 17, 89.25, 'Science', 'Mrs. Smith'), 46 | ('Benjamin', 18, 95.00, 'History', 'Mr. Davis'), 47 | ('Chloe', 16, 90.75, 'English', 'Ms. Wilson'), 48 | ('Daniel', 17, 88.50, 'Math', 'Mr. Johnson'), 49 | ('Emma', 18, 94.50, 'Science', 'Mrs. Smith'), 50 | ('Fiona', 16, 92.25, 'History', 'Mr. Davis'), 51 | ('Gabriel', 17, 87.75, 'English', 'Ms. Wilson'), 52 | ('Haley', 18, 96.75, 'Math', 'Mr. Johnson'), 53 | ('Isabella', 16, 92.00, 'Science', 'Mrs. Smith'), 54 | ('Jackson', 17, 89.50, 'History', 'Mr. Davis'), 55 | ('Kylie', 18, 95.25, 'English', 'Ms. Wilson'), 56 | ('Landon', 16, 91.00, 'Math', 'Mr. Johnson'), 57 | ('Mila', 17, 87.00, 'Science', 'Mrs. Smith'), 58 | ('Nathan', 18, 94.75, 'History', 'Mr. Davis'), 59 | ('Olivia', 16, 90.00, 'English', 'Ms. Wilson'), 60 | ('Parker', 17, 88.00, 'Math', 'Mr. Johnson'), 61 | ('Quinn', 18, 93.75, 'Science', 'Mrs. Smith'), 62 | ('Riley', 16, 91.50, 'History', 'Mr. Davis'); 63 | ``` 64 | --------------------------------------------------------------------------------