├── README.md ├── license ├── notes ├── 01_introduction_to_databases │ ├── 01_databases_intro.md │ ├── 02_types_of_databases.md │ ├── 03_database_management_systems_dbms_.md │ ├── 04_data_models.md │ └── 05_glossary.md ├── 02_database_design │ ├── 01_requirements_analysis.md │ ├── 02_normalization.md │ ├── 03_denormalization.md │ ├── 04_indexing_strategies.md │ └── 05_data_integrity.md ├── 03_sql │ ├── 01_intro_to_sql.md │ ├── 02_data_definition_language_ddl.md │ ├── 03_data_manipulation_language_dml.md │ ├── 04_data_control_language_dcl.md │ ├── 05_transaction_control_language_tcl.md │ ├── 06_joins_subqueries_and_views.md │ ├── 07_stored_procedures_and_functions.md │ ├── 08_triggers.md │ ├── 09_hierarchical_data.md │ ├── 10_aggregate_functions.md │ └── 11_window_functions.md ├── 04_acid_properties_and_transactions │ ├── 01_transactions_intro.md │ ├── 02_atomicity.md │ ├── 03_consistency.md │ ├── 04_isolation.md │ └── 05_durability.md ├── 05_storage_and_indexing │ ├── 01_how_tables_and_indexes_are_stored_on_disk.md │ ├── 02_row_based_vs_column_based_databases.md │ ├── 03_primary_key_vs_secondary_key.md │ ├── 04_database_pages.md │ └── 05_indexing.md ├── 06_distributed_databases │ ├── 01_distributed_database_systems.md │ ├── 02_partitioning.md │ ├── 03_sharding.md │ ├── 04_partitioning_vs_sharding.md │ ├── 05_consistent_hashing.md │ ├── 06_cap_theorem.md │ ├── 07_eventual_consistency.md │ └── 08_distributed_database_systems.md ├── 07_concurrency_control │ ├── 01_shared_vs_exclusive_locks.md │ ├── 02_deadlocks.md │ ├── 03_two_phase_locking.md │ ├── 04_double_booking_problem.md │ └── 05_serializable_vs_repeatable_read.md ├── 08_database_performance │ ├── 01_query_optimization_techniques.md │ ├── 02_indexing_strategies.md │ ├── 03_database_caching.md │ ├── 04_materialized_views.md │ ├── 05_accessing_database_in_code.md │ └── 06_working_with_billion_row_table.md ├── 09_database_replication │ ├── 01_intro_to_replication.md │ ├── 02_master_standby_replication.md │ ├── 03_multi_master_replication.md │ └── 04_synchronous_vs_asynchronous_replication.md ├── 10_nosql_databases │ ├── 01_nosql_databases_intro.md │ ├── 02_types_of_nosql_databases.md │ ├── 03_querying_nosql_databases.md │ └── 04_crud_in_sql_vs_nosql.md ├── 11_security_best_practices │ ├── 01_backup_and_recovery_strategies.md │ ├── 02_database_security.md │ ├── 03_capacity_planning.md │ ├── 04_database_migration.md │ ├── 05_performance_monitoring_and_tuning.md │ ├── 06_sql_injection.md │ └── 07_crash_recovery_in_databases.md ├── 12_database_engines │ ├── 01_sqlite.md │ ├── 02_mysql.md │ ├── 03_postgresql.md │ ├── 04_mongodb.md │ ├── 05_neo4j.md │ ├── 06_aws_services.md │ └── 07_choosing_database.md ├── 13_big_data │ ├── 01_data_warehousing.md │ ├── 02_hadoop_and_hdfs.md │ └── 03_spark_sql.md ├── 14_orm │ ├── 01_introduction_to_orm.md │ └── 02_popular_orm_tools.md └── quizes.md └── scripts ├── concurrency ├── concurrent_readers.py ├── deadlock_file_level.py ├── deadlock_row_level.py ├── mvcc.py ├── optimistic_vs_pesimistic_lock.py └── transaction_isolation.py ├── create_mock_db.py ├── diagrams └── hash_ring.py ├── generating_query_strings ├── create_table.py ├── delete.py ├── insert_query.py ├── select.py └── update_query.py └── setup └── start_postgres.sh /license: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Adam Djellouli 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files 
(the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /notes/01_introduction_to_databases/01_databases_intro.md: -------------------------------------------------------------------------------- 1 | ## Introduction to Databases 2 | 3 | Databases are the backbone of modern applications, serving as organized repositories where data is stored, managed, and retrieved efficiently. Think of a database as a digital library where information is cataloged systematically, making it easy to find and use. Whether it's a simple contact list on your phone or a massive system powering a social media platform, databases play a crucial role in handling data effectively. 4 | 5 | ``` 6 | +-------------------------------------------------------------+ 7 | | Database | 8 | |-------------------------------------------------------------| 9 | | [ Tables ] | 10 | | | 11 | | +----------------+ +--------------+ +--------------+ | 12 | | | Users | | Orders | | Products | | 13 | | +----------------+ +--------------+ +--------------+ | 14 | | | UserID | | OrderID | | ProductID | | 15 | | | Name | | UserID | | Name | | 16 | | | Email | | Date | | Price | | 17 | | +----------------+ +--------------+ +--------------+ | 18 | | | 19 | | [ Relationships ] | 20 | | | 21 | | Users.UserID <--------> Orders.UserID | 22 | | Orders.ProductID <-----> Products.ProductID | 23 | +-------------------------------------------------------------+ 24 | ``` 25 | 26 | After reading the material, you should be able to answer the following questions: 27 | 28 | 1. What are the fundamental components of a database, and how do tables, fields, records, and relationships work together to organize data? 29 | 2. What are the key advantages of using databases over simpler data storage methods like text files or spreadsheets? 30 | 3. How can you perform basic SQL operations such as creating tables, inserting data, querying records, updating entries, and deleting records? 31 | 4. What are the different types of relationships between tables, and how do SQL JOIN operations facilitate the retrieval of related data across multiple tables? 32 | 5. What are the various types of databases available (e.g., relational, NoSQL, in-memory), and what are their specific use cases and benefits? 33 | 34 | ### Understanding Databases 35 | 36 | At its simplest, a database is a collection of information organized in a way that allows for easy access and management. 
Databases enable applications to store data persistently, ensuring that information remains available even after the application is closed or the system is restarted. 37 | 38 | #### Components of a Database 39 | 40 | - The **tables in a database** serve as the foundational structures, organizing data into rows and columns similar to a spreadsheet. Each table represents a distinct entity, such as users, orders, or products. 41 | - **Fields, also known as columns**, define the type of data stored in a table. For instance, a "Users" table might include fields such as UserID, Name, and Email. 42 | - The **records, represented by rows**, are individual entries in a table, each containing data about a specific item or entity. 43 | - **Relationships between tables** establish connections, enabling data to be linked and referenced across the database for better organization and retrieval. 44 | 45 | ### Why Use a Database? 46 | 47 | Databases offer several advantages over simpler data storage methods like text files or spreadsheets: 48 | 49 | - Databases are **designed for efficient data management**, handling large volumes of information seamlessly to enable quick retrieval and updates. 50 | - Built-in rules and constraints in databases ensure **data integrity**, maintaining accuracy and consistency across all records. 51 | - Organizations rely on **robust security features** in databases to safeguard sensitive information and control access. 52 | - The **scalability of databases** allows them to grow with an application's needs, accommodating increasing data volumes and user demands. 53 | - Databases enable **concurrent access** by multiple users, allowing simultaneous data modifications without causing conflicts. 54 | - With **powerful querying capabilities**, databases support complex queries and aggregations, facilitating comprehensive data analysis. 55 | - Structured mechanisms ensure **reliable transaction processing**, maintaining data consistency even in multi-step operations. 56 | - **Automated backup and recovery features** in databases protect against data loss, ensuring business continuity. 57 | - The **customizability of database designs** allows tailored solutions to meet specific organizational needs. 58 | - Through **integration capabilities**, databases can connect seamlessly with other software and systems, enhancing workflow efficiency. 59 | - **Advanced indexing techniques** improve search performance, ensuring fast access to required information. 60 | - Comprehensive tools for **reporting and analytics** empower decision-makers with actionable insights from raw data. 61 | - By supporting **different data formats**, databases accommodate both structured and unstructured information effectively. 62 | - **Data redundancy is minimized** through normalization, reducing storage requirements and maintaining consistency. 63 | 64 | ### Interacting with Databases 65 | 66 | To communicate with a database, we use a language called SQL (Structured Query Language). SQL provides commands to perform various operations like creating tables, inserting data, querying, updating, and deleting records. 67 | 68 | #### Basic SQL Operations 69 | 70 | I. **Creating a Table** 71 | 72 | To define a new table in the database: 73 | 74 | ```sql 75 | CREATE TABLE Users ( 76 | UserID INT PRIMARY KEY, 77 | Name VARCHAR(100), 78 | Email VARCHAR(100) 79 | ); 80 | ``` 81 | 82 | This command creates a "Users" table with three fields: UserID, Name, and Email. 83 | 84 | II. 
**Inserting Data** 85 | 86 | To add a new record to a table: 87 | 88 | ```sql 89 | INSERT INTO Users (UserID, Name, Email) 90 | VALUES (1, 'Alice Smith', 'alice@example.com'); 91 | ``` 92 | 93 | This inserts a new user into the "Users" table. 94 | 95 | III. **Querying Data** 96 | 97 | To retrieve data from a table: 98 | 99 | ```sql 100 | SELECT * FROM Users; 101 | ``` 102 | 103 | **Output:** 104 | 105 | | UserID | Name | Email | 106 | |--------|--------------|-------------------| 107 | | 1 | Alice Smith | alice@example.com | 108 | 109 | This command fetches all records from the "Users" table. 110 | 111 | IV. **Updating Data** 112 | 113 | To modify existing data: 114 | 115 | ```sql 116 | UPDATE Users 117 | SET Email = 'alice.smith@example.com' 118 | WHERE UserID = 1; 119 | ``` 120 | 121 | This updates Alice's email address in the "Users" table. 122 | 123 | V. **Deleting Data** 124 | 125 | To remove a record: 126 | 127 | ```sql 128 | DELETE FROM Users 129 | WHERE UserID = 1; 130 | ``` 131 | 132 | This deletes the user with UserID 1 from the "Users" table. 133 | 134 | ### Relationships Between Tables 135 | 136 | Establishing relationships between tables allows for more complex and meaningful data queries. The most common types of relationships are one-to-one, one-to-many, and many-to-many. 137 | 138 | #### One-to-Many Relationship 139 | 140 | An example is a user who can place multiple orders: 141 | 142 | ``` 143 | +-----------+ +-----------+ 144 | | Users | | Orders | 145 | +-----------+ +-----------+ 146 | | UserID | | OrderID | 147 | | Name | | UserID | 148 | | Email | | Date | 149 | +-----------+ +-----------+ 150 | ``` 151 | 152 | The "Orders" table references the "Users" table through the UserID field, indicating which user placed each order. 153 | 154 | #### Joining Tables 155 | 156 | To retrieve data that spans multiple tables, we use SQL JOIN operations. 157 | 158 | ##### Example: Retrieving User Orders 159 | 160 | ```sql 161 | SELECT Users.Name, Orders.OrderID, Orders.Date 162 | FROM Users 163 | JOIN Orders ON Users.UserID = Orders.UserID; 164 | ``` 165 | 166 | **Output:** 167 | 168 | | Name | OrderID | Date | 169 | |-------------|---------|------------| 170 | | Alice Smith | 1001 | 2024-02-01 | 171 | | Bob Jones | 1002 | 2024-02-02 | 172 | 173 | This query combines data from the "Users" and "Orders" tables to show which orders were placed by each user. 174 | 175 | ### Real-World Analogy 176 | 177 | Imagine a database as a warehouse filled with filing cabinets: 178 | 179 | - The **warehouse in a database** represents the entire collection of data, encompassing all the stored information. 180 | - **Filing cabinets in the form of tables** provide organized storage units, categorizing data into distinct types of records for easy management. 181 | - **Folders, corresponding to records**, hold individual pieces of information, each relating to a specific item or entity. 182 | - The **labels, reflected as fields**, act as identifiers, describing the contents and attributes of each folder within the structure. 183 | 184 | This structure allows anyone to find specific information quickly, much like a well-organized database facilitates efficient data retrieval. 185 | 186 | ### Types of Databases 187 | 188 | While relational databases using SQL are common, there are other types of databases designed for specific needs. 189 | 190 | #### Relational Databases 191 | 192 | - Use tables to store data. 193 | - Employ SQL for data manipulation. 
194 | - Ideal for structured data with clear relationships. 195 | - Examples: MySQL, PostgreSQL, Oracle. 196 | 197 | #### NoSQL Databases 198 | 199 | - Store data in formats like key-value pairs, documents, or graphs. 200 | - Do not require fixed table schemas. 201 | - Handle unstructured or rapidly changing data. 202 | - Examples: MongoDB (document), Redis (key-value), Neo4j (graph). 203 | 204 | #### In-Memory Databases 205 | 206 | - Keep data in RAM for faster access. 207 | - Useful for caching and real-time analytics. 208 | - Example: Redis. 209 | 210 | ### Benefits of Using Databases in Applications 211 | 212 | - The **data integrity and validation features** of databases enforce rules to ensure that all information entered adheres to predefined standards and correctness. 213 | - **Transactions in databases** enable multiple operations to be executed as a single cohesive unit, ensuring consistency even if part of the operation fails. 214 | - **Backup and recovery mechanisms** are integral to databases, providing protection against data loss and enabling restoration in case of failures. 215 | - The **performance optimization techniques** in databases, such as indexing and query optimization, significantly enhance the speed and efficiency of data retrieval. 216 | 217 | -------------------------------------------------------------------------------- /notes/01_introduction_to_databases/02_types_of_databases.md: -------------------------------------------------------------------------------- 1 | ## Overview of Database Types 2 | 3 | Databases are essential tools that store, organize, and manage data for various applications. They come in different types, each designed to handle specific data models and use cases. Understanding the various database types helps in selecting the right one for your application's needs. Let's delve into the major types of databases and explore their characteristics, strengths, and suitable applications. 4 | 5 | After reading the material, you should be able to answer the following questions: 6 | 7 | 1. What are the components of a database, and how do tables, fields, records, and relationships work together to organize data? 8 | 2. What are the advantages of using databases over simpler data storage methods like text files or spreadsheets? 9 | 3. How can you perform basic SQL operations such as creating tables, inserting data, querying records, updating entries, and deleting records? 10 | 4. What are the different types of relationships between tables, and how do SQL JOIN operations facilitate the retrieval of related data across multiple tables? 11 | 5. What are the various types of databases available (e.g., relational, NoSQL, in-memory), and what are their specific use cases and benefits? 12 | 13 | ### Relational Databases (RDBMS) 14 | 15 | Relational databases store data in structured tables composed of rows and columns, similar to spreadsheets. Each table represents an entity, and relationships between these entities are established using keys. This structured approach ensures data integrity and allows complex querying through Structured Query Language (SQL). 
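As a minimal sketch of how such keys are declared in SQL (anticipating the Customers and Orders tables shown just below; the exact column types are assumptions), a one-to-many relationship is expressed with a primary key on the parent table and a foreign key on the child table:

```sql
CREATE TABLE Customers (
    CustomerID INT PRIMARY KEY,
    Name       VARCHAR(100),
    Email      VARCHAR(100)
);

CREATE TABLE Orders (
    OrderID    INT PRIMARY KEY,
    CustomerID INT,
    OrderDate  DATE,
    -- the foreign key ties every order to an existing customer
    FOREIGN KEY (CustomerID) REFERENCES Customers(CustomerID)
);
```

With the constraint in place, the database rejects an order that points to a non-existent customer, which is part of what gives relational systems their integrity guarantees.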
16 | 17 | Imagine a simple database for an online store: 18 | 19 | ``` 20 | Customers Table 21 | +------------+--------------+---------------------+ 22 | | CustomerID | Name | Email | 23 | +------------+--------------+---------------------+ 24 | | 1 | Alice Smith | alice@example.com | 25 | | 2 | Bob Johnson | bob@example.com | 26 | +------------+--------------+---------------------+ 27 | 28 | Orders Table 29 | +---------+------------+------------+ 30 | | OrderID | CustomerID | OrderDate | 31 | +---------+------------+------------+ 32 | | 101 | 1 | 2023-01-15 | 33 | | 102 | 2 | 2023-01-16 | 34 | +---------+------------+------------+ 35 | ``` 36 | 37 | Here, the `CustomerID` column in the `Orders` table references the `Customers` table, establishing a relationship. 38 | 39 | #### Representative Systems 40 | 41 | - **MySQL** 42 | - **PostgreSQL** 43 | - **Oracle Database** 44 | - **Microsoft SQL Server** 45 | 46 | #### Use Cases and Strengths 47 | Relational databases excel in applications requiring structured data management and complex transaction support, such as financial systems, inventory management, and enterprise resource planning (ERP). 48 | 49 | - The system employs *structured data management* to maintain consistency and integrity across various datasets. 50 | - The platform supports *complex query support* that enables users to execute detailed SQL operations and join multiple tables effectively. 51 | - The architecture provides *reliability* by adhering to ACID properties, ensuring that transactions are processed in a dependable manner. 52 | 53 | #### Limitations 54 | 55 | - Vertical scaling can be challenging and expensive because the system exhibits *scalability constraints* that limit growth options. 56 | - Adjusting data models is often cumbersome due to the system's reliance on *rigid schemas* that restrict flexibility. 57 | - The system may experience reduced efficiency with large volumes of unstructured data as a result of *performance issues* that arise during processing. 58 | 59 | ### NoSQL Databases 60 | 61 | NoSQL databases have flexible schemas designed to handle unstructured or semi-structured data, categorized by different models optimized for specific use cases. 62 | 63 | ### Document-Based Databases 64 | 65 | Store data as JSON-like documents with flexible structures, allowing nested and varied data fields. 66 | 67 | Example: 68 | 69 | ``` 70 | { 71 | "userID": "1", 72 | "name": "Alice Smith", 73 | "email": "alice@example.com", 74 | "orders": [ 75 | {"orderID": "101", "amount": 250.00}, 76 | {"orderID": "103", "amount": 125.50} 77 | ] 78 | } 79 | ``` 80 | 81 | #### Representative Systems 82 | 83 | - **MongoDB** 84 | - **Couchbase** 85 | - **Amazon DocumentDB** 86 | 87 | #### Use Cases and Strengths 88 | Ideal for content management, product catalogs, and agile application development. 89 | 90 | - The system easily adapts to evolving data structures due to its *flexible schemas* that accommodate changes without disruption. 91 | - Users experience swift data operations as the architecture is engineered for rapid read and write capabilities, ensuring *high performance* in various workloads. 92 | - The design supports smooth expansion across servers, which underscores the system's inherent *scalability* for handling increased demand. 93 | 94 | #### Limitations 95 | 96 | - When handling operations that span multiple documents, users might encounter challenges arising from *limited complex transactions* that restrict multi-document support. 
97 | - The system often struggles to execute relational queries effectively, leading to *less efficient complex queries* that impact performance. 98 | 99 | ### Key-Value Databases 100 | 101 | Simplest form, storing data as key-value pairs. 102 | 103 | Example session store: 104 | 105 | ``` 106 | "session1234": { "userID": "1", "loginTime": "2023-01-15T10:00:00Z" } 107 | ``` 108 | 109 | #### Representative Systems 110 | 111 | - **Redis** 112 | - **Amazon DynamoDB** 113 | - **Riak** 114 | 115 | #### Use Cases and Strengths 116 | Perfect for caching, session management, and real-time data processing. 117 | 118 | - Users benefit from swift operations in both reading and writing data, a design feature that underscores the system's *high throughput* capabilities. 119 | - Integration remains straightforward due to the system's *simplicity*, which provides clear and accessible APIs for developers. 120 | - The infrastructure supports effortless expansion, making *seamless scalability* an integral part of its design for handling increased workloads. 121 | 122 | #### Limitations 123 | 124 | - Users may find that the system does not support intricate querying needs, as it exhibits *limited query capabilities* that are better suited for simpler operations. 125 | - Managing complex data structures can be problematic, reflecting the inherent challenge of *challenging data modeling* in scenarios where relationships are multifaceted. 126 | 127 | ### Column-Based Databases 128 | 129 | Store data in rows with flexible columns, optimized for large-scale and time-series data. 130 | 131 | Example sensor readings: 132 | 133 | ``` 134 | Sensor ID: sensor1 135 | +-------------------+-------+ 136 | | Timestamp | Value | 137 | +-------------------+-------+ 138 | | 2023-01-15T10:00 | 20.5 | 139 | | 2023-01-15T10:05 | 21.0 | 140 | +-------------------+-------+ 141 | 142 | Sensor ID: sensor2 143 | +-------------------+-------+ 144 | | Timestamp | Value | 145 | +-------------------+-------+ 146 | | 2023-01-15T10:00 | 18.2 | 147 | | 2023-01-15T10:05 | 18.5 | 148 | +-------------------+-------+ 149 | ``` 150 | 151 | #### Representative Systems 152 | 153 | - **Apache Cassandra** 154 | - **Apache HBase** 155 | - **Google Cloud Bigtable** 156 | 157 | #### Use Cases and Strengths 158 | Suitable for analytics, logging, and handling vast volumes of time-series data. 159 | 160 | - The platform efficiently processes large volumes of data, which highlights its *high performance* in handling demanding workloads. 161 | - Its distributed architecture supports easy expansion across multiple nodes, reflecting the system's inherent *scalability*. 162 | - By utilizing adaptable data models, the design readily adjusts to evolving requirements, exemplifying its *flexible schemas*. 163 | 164 | #### Limitations 165 | 166 | - Users may find that the system does not support ad-hoc queries well, a limitation that reflects its *complex querying* challenges. 167 | - Adopting the system involves a notable shift from traditional database methods, highlighting a steep *learning curve* that users must overcome. 168 | 169 | ### Graph Databases 170 | 171 | Specialize in representing complex relationships using nodes (entities) and edges (relationships). 
172 | 173 | Example relationship visualization: 174 | 175 | ``` 176 | [User: Alice]—[FRIENDS_WITH]→[User: Bob]—[LIKES]→[Post: "Graph Databases 101"] 177 | ``` 178 | 179 | #### Representative Systems 180 | 181 | - **Neo4j** 182 | - **Amazon Neptune** 183 | - **OrientDB** 184 | 185 | #### Use Cases and Strengths 186 | Ideal for social networks, recommendation engines, and applications centered on relationships. 187 | 188 | - The system is designed to navigate complex relational data, providing *efficient relationship handling* that improves query performance. 189 | - Its architecture supports modeling intricate data relationships seamlessly with an *adaptable schema* that evolves with changing requirements. 190 | - Query operations are executed rapidly, ensuring *high-performance* outcomes when processing relationship-based queries. 191 | 192 | #### Limitations 193 | 194 | - The system is best applied in specific, relationship-centric scenarios that highlight its focus on *niche use cases*. 195 | - Effective usage of the system often involves mastering specialized query languages and a detailed understanding, underscoring its inherent *complexity*. 196 | 197 | ### Choosing the Right Database 198 | 199 | Selecting the appropriate database depends on your application's requirements, including data structure, scalability needs, consistency models, and query complexity. 200 | 201 | - Relational databases are well-suited for managing *structured data* while NoSQL systems excel at handling unstructured data and dynamic schemas. 202 | - Distributed environments often leverage *Horizontal scalability*, a feature that NoSQL databases typically support more effectively. 203 | - Relational databases offer *strong consistency and robust transaction support*, which ensures reliable data integrity compared to the eventual consistency of many NoSQL solutions. 204 | - Applications requiring intricate operations benefit from *complex queries and joins*, a capability that makes relational databases more ideal for transactional scenarios. 205 | -------------------------------------------------------------------------------- /notes/01_introduction_to_databases/05_glossary.md: -------------------------------------------------------------------------------- 1 | ## Glossary of Database and SQL Terms 2 | 3 | 1. **Database**: A collection of organized data for easy access, management, and updating. 4 | 2. **Table**: A structure with rows and columns for storing data in a database. 5 | 3. **Row (Record)**: A single entry in a table with data. 6 | 4. **Column (Field)**: A category of data within a table. 7 | 5. **Primary Key**: A unique identifier for each row in a table. 8 | 6. **Foreign Key**: A key that connects one table to another by referring to the primary key of the other table. 9 | 7. **Index**: A tool that speeds up data retrieval in a database. 10 | 8. **Query**: A request to access or modify data in a database. 11 | 9. **SQL (Structured Query Language)**: A language for working with relational databases. 12 | 10. **SELECT**: An SQL command for getting data from a table. 13 | 11. **INSERT**: An SQL command for adding new data to a table. 14 | 12. **UPDATE**: An SQL command for changing existing data in a table. 15 | 13. **DELETE**: An SQL command for removing data from a table. 16 | 14. **JOIN**: An SQL operation that combines data from multiple tables based on shared columns. 17 | 15. **WHERE**: An SQL keyword for filtering data based on specific conditions. 18 | 16. 
**GROUP BY**: An SQL keyword for grouping rows with the same values in specified columns. 19 | 17. **ORDER BY**: An SQL keyword for sorting results based on certain columns. 20 | 18. **Schema**: The structure of a database, including tables, columns, and relationships. 21 | 19. **ACID (Atomicity, Consistency, Isolation, Durability)**: Features that ensure database transactions are reliable. 22 | 20. **RDBMS (Relational Database Management System)**: A system for managing relational databases using SQL. 23 | 21. **Constraint**: A rule for table columns to keep data accurate and consistent. 24 | 22. **UNIQUE**: A constraint that makes sure all values in a column are different. 25 | 23. **NOT NULL**: A constraint that requires a column to have a value. 26 | 24. **Check**: A constraint that forces all column values to meet a certain condition. 27 | 25. **Index**: A database object that improves the speed of data retrieval within a table. 28 | 26. **View**: A virtual table created from the results of an SQL query. 29 | 27. **Alias**: A temporary name given to a table or column in an SQL query for easier reference. 30 | 28. **TRANSACTION**: A group of SQL operations executed as a single task. 31 | 29. **COMMIT**: An SQL command for saving changes made by a transaction. 32 | 30. **ROLLBACK**: An SQL command for undoing changes made by a transaction. 33 | 31. **TRIGGER**: A stored procedure that runs automatically when an event (INSERT, UPDATE, DELETE) occurs in a table. 34 | 32. **Stored procedure**: A saved set of SQL statements in a database. 35 | 33. **Function**: A set of SQL statements with a name, input parameters, actions, and a result. 36 | 34. **Normalization**: A method for organizing data in a database to reduce redundancy and improve data integrity. 37 | 35. **Denormalization**: A process of adding redundant data to a database to speed up query performance. 38 | 36. **DDL (Data Definition Language)**: A part of SQL for creating and modifying database objects like tables and indexes. 39 | 37. **DML (Data Manipulation Language)**: A part of SQL for working with data in a database, including SELECT, INSERT, UPDATE, and DELETE. 40 | 38. **DCL (Data Control Language)**: A part of SQL for managing user access and permissions, such as GRANT and REVOKE. 41 | 39. **TCL (Transaction Control Language)**: A part of SQL for handling transactions, including COMMIT and ROLLBACK. 42 | 40. **NULL**: A special marker in SQL that indicates a data value is missing or unknown in the database. 43 | 41. **NoSQL**: A class of non-relational databases designed for handling various types of data, often providing better scalability and flexibility than traditional relational databases. 44 | 42. **CAP Theorem**: A principle stating that it is impossible for a distributed data store to simultaneously provide consistency, availability, and partition tolerance. 45 | 43. **Sharding**: The process of splitting a large database into smaller, more manageable pieces, often improving performance and scalability. 46 | 44. **Partitioning**: The practice of dividing a table into smaller, more manageable pieces based on a specific column or set of columns. 47 | 45. **Replication**: The process of copying and maintaining the same data on multiple database nodes to increase availability and fault tolerance. 48 | 46. 
**BASE (Basically Available, Soft State, Eventual Consistency)**: A set of attributes that describe the behavior of some distributed systems, providing a more relaxed approach to consistency compared to ACID properties. 49 | 47. **Graph Database**: A type of NoSQL database that stores data as nodes and edges in a graph, optimized for querying and traversing relationships between data points. 50 | 48. **Amazon RDS**: A managed relational database service provided by Amazon Web Services (AWS), offering support for multiple database engines, including MySQL, PostgreSQL, and Oracle. 51 | 49. **Amazon DynamoDB**: A managed NoSQL database service provided by AWS, designed for high availability, scalability, and low latency. 52 | 50. **Amazon Aurora**: A managed relational database service provided by AWS, offering compatibility with MySQL and PostgreSQL and improved performance, availability, and scalability. 53 | 51. **Caching**: Temporary storage of query results or intermediate data to speed up subsequent query executions. 54 | 52. **Horizontal Scaling**: The practice of adding more nodes to a system to handle increased workload, often used in distributed systems to improve performance and availability. 55 | 53. **Vertical Scaling**: The practice of adding more resources, such as CPU or memory, to a single node to handle increased workload. 56 | 54. **In-Memory Database**: A type of database that stores data in the main memory instead of on disk, providing faster data access and processing times. 57 | 55. **SQL Injection**: A security vulnerability that occurs when an attacker is able to insert malicious SQL code into a query, potentially compromising the database or exposing sensitive data. 58 | 56. **ETL (Extract, Transform, Load)**: A process used to collect, clean, and move data from one or more sources to a data warehouse or another data store. 59 | 57. **OLTP (Online Transaction Processing)**: A class of systems designed for managing transactional workloads, such as inserting, updating, and deleting records. 60 | 58. **OLAP (Online Analytical Processing)**: A class of systems designed for managing analytical workloads, such as complex queries and aggregations. 61 | 59. **Data Warehousing**: A large-scale data storage solution optimized for storing, managing, and analyzing large amounts of historical data from various sources. 62 | 60. **Big Data**: A term referring to the massive volume, variety, and velocity of data generated by modern applications and devices, often requiring specialized tools and techniques for processing and analysis. 63 | 61. **Hadoop**: An open-source framework for distributed storage and processing of large datasets using the MapReduce programming model. 64 | 62. **MapReduce**: A programming model for processing and generating large data sets in parallel across a distributed computing environment. 65 | 63. **Apache Spark**: An open-source distributed data processing engine designed for high-performance, large-scale data processing and machine learning tasks. 66 | 64. **Apache Cassandra**: A highly scalable, distributed NoSQL database designed for handling large amounts of data across many nodes, providing high availability and fault tolerance. 67 | 65. **Elasticsearch**: An open-source, distributed search and analytics engine built on Apache Lucene, used for indexing and searching large volumes of data. 
68 | -------------------------------------------------------------------------------- /notes/02_database_design/03_denormalization.md: -------------------------------------------------------------------------------- 1 | ## Denormalization in Databases 2 | 3 | Denormalization might seem counterintuitive, especially if you're familiar with the principles of normalization that aim to reduce redundancy and dependency in databases. However, denormalization is a strategic process where we intentionally introduce redundancy into a database design. This approach can enhance read performance and simplify complex queries, making it a valuable technique in certain scenarios. 4 | 5 | After reading the material, you should be able to answer the following questions: 6 | 7 | 1. What is denormalization in databases, and how does it differ from normalization? 8 | 2. Why might denormalization be necessary, and in what scenarios is it most beneficial? 9 | 3. What are the common denormalization techniques, such as adding redundant columns, precomputing aggregate values, duplicating tables, and creating denormalized data structures? 10 | 4. What are the primary benefits and drawbacks of implementing denormalization in a database design? 11 | 5. What best practices should be followed to ensure data integrity and maintain performance when denormalizing a database? 12 | 13 | ### Understanding Denormalization 14 | 15 | At its core, denormalization involves combining data from multiple tables into a single table. This reduces the need for costly join operations during data retrieval, which can significantly speed up query performance. Imagine a library where all the information about a book—its title, author, genre, and availability—is stored in one card rather than scattered across multiple indexes. This makes it quicker to find all the information you need without flipping through several files. 16 | 17 | #### Why Denormalize? 18 | 19 | The primary motivation for denormalization is to improve read performance and query efficiency. In systems where read operations are much more frequent than write operations, denormalization can reduce the complexity of data retrieval. By having related data in a single table, the database can fetch all necessary information with fewer operations. 20 | 21 | However, denormalization involves certain trade-offs: 22 | 23 | - Increased redundancy occurs because data is duplicated across the database, leading to higher storage requirements. 24 | - The risk of data inconsistency rises since multiple copies of the same data may not always be updated correctly. 25 | - Write operations become more complex, as insertions, updates, and deletions must be reflected consistently across multiple locations. 26 | 27 | #### When to Consider Denormalization 28 | 29 | Denormalization proves particularly useful in specific scenarios: 30 | 31 | - It is beneficial when performance bottlenecks arise, and analysis reveals that join operations are significantly slowing down the database. 32 | - In systems with a high read-to-write ratio, the advantages of faster reads often outweigh the challenges of managing more complex write operations. 33 | - Simplifying complex queries becomes advantageous when multiple joins make queries slow, complicated, and difficult to maintain. 
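As a rough sketch of the trade-off (the `orders`, `customers`, and `orders_denormalized` tables and their columns here are hypothetical), a read that normally requires a join can be served from a single denormalized table, at the cost of duplicating customer data on every order row:

```sql
-- Normalized: reading an order's customer details requires a join
SELECT o.order_id, o.order_date, c.name, c.address
FROM orders o
JOIN customers c ON c.customer_id = o.customer_id
WHERE o.order_id = 42;

-- Denormalized: customer columns are copied onto the order row,
-- so the same read touches only one table (writes must keep the copy in sync)
SELECT order_id, order_date, customer_name, customer_address
FROM orders_denormalized
WHERE order_id = 42;
```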
34 | 35 | ### Denormalization Techniques 36 | 37 | There are several strategies for denormalizing a database: 38 | 39 | #### Adding Redundant Columns 40 | 41 | This involves adding a column to a table that duplicates data from a related table. For example, adding a customer's address directly to the orders table so that it doesn't need to be fetched from a separate customers table during order processing. 42 | 43 | #### Precomputing Aggregate Values 44 | 45 | Storing computed values, like totals or counts, can save time on queries that would otherwise have to calculate these values on the fly. For instance, keeping a running total of sales in a summary table. 46 | 47 | #### Duplicate Tables 48 | 49 | Maintaining multiple copies of a table tailored for different types of queries can improve performance. One table might be optimized for reading, while another is optimized for writing. 50 | 51 | #### Denormalized Data Structures 52 | 53 | Creating structures like star schemas or fact tables in data warehousing, where normalized data is restructured to optimize for query performance. 54 | 55 | ### An Example of Denormalization 56 | 57 | Let's consider a database that manages suppliers, parts, and projects. In a fully normalized design, you might have separate tables for suppliers, parts, and projects, linked through foreign keys. Fetching all the details about which suppliers are involved in which projects requires joining these tables. 58 | 59 | #### Normalized Tables 60 | 61 | **Suppliers Table** 62 | 63 | | SupplierID | SupplierName | 64 | |------------|--------------| 65 | | S1 | Supplier A | 66 | | S2 | Supplier B | 67 | | S3 | Supplier C | 68 | 69 | **Parts Table** 70 | 71 | | PartID | PartName | 72 | |--------|----------| 73 | | P1 | Part X | 74 | | P2 | Part Y | 75 | | P3 | Part Z | 76 | 77 | **Projects Table** 78 | 79 | | ProjectID | ProjectName | 80 | |-----------|-------------| 81 | | J1 | Project Alpha | 82 | | J2 | Project Beta | 83 | | J3 | Project Gamma | 84 | 85 | **Supplier_Part_Project Table** 86 | 87 | | SupplierID | PartID | ProjectID | 88 | |------------|--------|-----------| 89 | | S1 | P1 | J1 | 90 | | S1 | P2 | J1 | 91 | | S2 | P1 | J2 | 92 | | S2 | P3 | J2 | 93 | | S3 | P1 | J3 | 94 | 95 | Retrieving information about suppliers for a specific project involves joining multiple tables, which can be inefficient for large datasets. 96 | 97 | #### Denormalized Table 98 | 99 | By denormalizing, we can combine the data into a single table: 100 | 101 | **Supplier_Part_Project_Denorm Table** 102 | 103 | | SupplierID | SupplierName | PartID | PartName | ProjectID | ProjectName | 104 | |------------|--------------|--------|----------|-----------|-------------| 105 | | S1 | Supplier A | P1 | Part X | J1 | Project Alpha | 106 | | S1 | Supplier A | P2 | Part Y | J1 | Project Alpha | 107 | | S2 | Supplier B | P1 | Part X | J2 | Project Beta | 108 | | S2 | Supplier B | P3 | Part Z | J2 | Project Beta | 109 | | S3 | Supplier C | P1 | Part X | J3 | Project Gamma | 110 | 111 | With all relevant data in one table, queries become simpler and faster because they no longer require joins across multiple tables. 112 | 113 | ### Benefits and Drawbacks 114 | 115 | #### Benefits 116 | 117 | - Improved read performance is achieved as data retrieval becomes faster due to the elimination of complex joins. 118 | - Queries are simpler, making them easier to write and maintain compared to normalized structures. 
119 | - Denormalized structures are better suited for reporting and analytics, allowing efficient data aggregation. 120 | 121 | #### Drawbacks 122 | 123 | - Data redundancy increases, leading to higher storage requirements and a potential for inconsistency. 124 | - Updates become more complex as modifications must be reflected across multiple redundant copies of data. 125 | - Maintenance overhead rises, requiring additional effort to ensure the integrity of the denormalized data. 126 | 127 | ### Best Practices for Denormalization 128 | 129 | When implementing denormalization, it is essential to adhere to these best practices to balance performance improvements and potential risks effectively: 130 | 131 | #### Careful Planning 132 | 133 | - It is important to identify performance bottlenecks by using profiling tools to pinpoint slow queries caused by complex joins. 134 | - Denormalization efforts should target specific areas of the database that will gain the most performance improvements. 135 | 136 | #### Ensuring Data Integrity 137 | 138 | - Automating the synchronization of redundant data through triggers or stored procedures helps reduce the risk of inconsistencies. 139 | - Enforcing data integrity rules is achievable by implementing database constraints wherever applicable. 140 | 141 | #### Monitoring and Adjusting 142 | 143 | - Regular reviews of performance are crucial to understand the impact of denormalization on both read and write operations. 144 | - Adjustments should be made as needed, including further denormalization or reverting changes based on observed performance metrics. 145 | 146 | #### Documenting Changes 147 | 148 | - Keeping detailed records of all denormalization changes and their justifications is essential for maintaining transparency. 149 | - Ensuring the entire team is informed about denormalized structures prevents misunderstandings during development and maintenance. 150 | 151 | ### Denormalization in Modern Databases 152 | 153 | With the advent of NoSQL databases and distributed systems, denormalization has become more prevalent. Many NoSQL databases are designed with denormalization in mind, prioritizing read performance and scalability over strict normalization. 154 | 155 | #### Denormalization in NoSQL Databases 156 | 157 | - **Document Stores** use databases such as MongoDB, which promote storing related data together in documents, effectively implementing denormalization. 158 | - **Key-Value Stores** access data through a single key, often necessitating data duplication to accommodate different access patterns. 159 | 160 | #### Trade-offs in NoSQL 161 | 162 | While NoSQL databases offer flexibility and performance benefits, they also require careful handling of data consistency and integrity, much like traditional databases that have been denormalized. 163 | -------------------------------------------------------------------------------- /notes/03_sql/05_transaction_control_language_tcl.md: -------------------------------------------------------------------------------- 1 | ## Transaction Control Language (TCL) 2 | 3 | In the world of databases, maintaining data integrity and consistency is crucial, especially when multiple operations are involved. Imagine you're at a bank's ATM, transferring money from your savings to your checking account. You wouldn't want the system to deduct the amount from your savings without adding it to your checking due to some error, right? 
This is where Transaction Control Language (TCL) comes into play, ensuring that all related operations either complete successfully together or fail without affecting the database's consistency. 4 | 5 | ### Understanding Transactions 6 | 7 | A transaction is a sequence of one or more SQL statements that are executed as a single unit of work. The primary goal is to ensure that either all operations within the transaction are completed successfully or none are, preserving the database's integrity. 8 | 9 | #### The ACID Properties 10 | 11 | Transactions adhere to the ACID properties: 12 | 13 | - **Atomicity** ensures that all operations within a transaction are completed as a single unit; if any operation fails, the entire transaction is aborted and no changes are applied. 14 | - **Consistency** guarantees that a transaction transitions the database from one valid state to another while adhering to all defined integrity constraints and rules. 15 | - **Isolation** ensures that transactions executing concurrently do not interfere with each other, preserving the correctness of operations. 16 | - **Durability** ensures that once a transaction is committed, its changes are permanently recorded and persist even in the event of a system failure. 17 | 18 | ### TCL Commands 19 | 20 | TCL provides several commands to manage transactions effectively: 21 | 22 | - `BEGIN TRANSACTION` 23 | - `COMMIT` 24 | - `ROLLBACK` 25 | - `SAVEPOINT` 26 | - `ROLLBACK TO SAVEPOINT` 27 | 28 | Let's delve into each of these commands with examples to understand how they work. 29 | 30 | #### BEGIN TRANSACTION 31 | 32 | Starting a transaction is like saying to the database, "I'm about to perform several operations that should be treated as a single, indivisible unit." 33 | 34 | ```sql 35 | BEGIN TRANSACTION; 36 | ``` 37 | 38 | After this command, all subsequent operations are part of the transaction until it's either committed or rolled back. 39 | 40 | #### COMMIT 41 | 42 | The `COMMIT` command saves all changes made during the transaction to the database permanently. 43 | 44 | **Example Scenario:** 45 | 46 | Suppose we have an `employees` table and want to increase the salary of all employees in department 1 by 10%. 47 | 48 | **Employees Table Before:** 49 | 50 | | employee_id | department_id | salary | 51 | |-------------|---------------|--------| 52 | | 1 | 1 | 1000 | 53 | | 2 | 1 | 1200 | 54 | | 3 | 2 | 1500 | 55 | 56 | **SQL Commands:** 57 | 58 | ```sql 59 | BEGIN TRANSACTION; 60 | 61 | UPDATE employees 62 | SET salary = salary * 1.10 63 | WHERE department_id = 1; 64 | 65 | COMMIT; 66 | ``` 67 | 68 | **Employees Table After:** 69 | 70 | | employee_id | department_id | salary | 71 | |-------------|---------------|--------| 72 | | 1 | 1 | 1100 | 73 | | 2 | 1 | 1320 | 74 | | 3 | 2 | 1500 | 75 | 76 | **Interpretation:** 77 | 78 | - The transaction starts. 79 | - Salaries for department 1 employees are updated. 80 | - `COMMIT` saves these changes permanently. 81 | 82 | #### ROLLBACK 83 | 84 | If something goes wrong during a transaction, you can undo all changes made within it using `ROLLBACK`. 85 | 86 | **Example Scenario:** 87 | 88 | We attempt the same salary update but realize there's a mistake before committing. 89 | 90 | ```sql 91 | BEGIN TRANSACTION; 92 | 93 | UPDATE employees 94 | SET salary = salary * 1.10 95 | WHERE department_id = 1; 96 | 97 | -- Oops! 
Realized we should only increase by 5% 98 | ROLLBACK; 99 | ``` 100 | 101 | **Employees Table After Rollback:** 102 | 103 | | employee_id | department_id | salary | 104 | |-------------|---------------|--------| 105 | | 1 | 1 | 1000 | 106 | | 2 | 1 | 1200 | 107 | | 3 | 2 | 1500 | 108 | 109 | **Interpretation:** 110 | 111 | - The transaction starts. 112 | - Salaries are updated incorrectly. 113 | - `ROLLBACK` undoes the changes, restoring the original salaries. 114 | 115 | #### SAVEPOINT 116 | 117 | A savepoint allows you to set a point within a transaction to which you can later roll back, without affecting the entire transaction. 118 | 119 | **Example Scenario:** 120 | 121 | We decide to update salaries in two departments but want the option to undo the second update without losing the first. 122 | 123 | ```sql 124 | BEGIN TRANSACTION; 125 | 126 | UPDATE employees 127 | SET salary = salary * 1.10 128 | WHERE department_id = 1; 129 | 130 | SAVEPOINT dept1_updated; 131 | 132 | UPDATE employees 133 | SET salary = salary * 1.05 134 | WHERE department_id = 2; 135 | ``` 136 | 137 | **Employees Table After Updates:** 138 | 139 | | employee_id | department_id | salary | 140 | |-------------|---------------|--------| 141 | | 1 | 1 | 1100 | 142 | | 2 | 1 | 1320 | 143 | | 3 | 2 | 1575 | 144 | 145 | **Interpretation:** 146 | 147 | - Salaries in department 1 are increased by 10%. 148 | - A savepoint named `dept1_updated` is created. 149 | - Salaries in department 2 are increased by 5%. 150 | 151 | #### ROLLBACK TO SAVEPOINT 152 | 153 | If we decide to undo the changes made after a savepoint, we can roll back to it. 154 | 155 | ```sql 156 | ROLLBACK TO dept1_updated; 157 | 158 | COMMIT; 159 | ``` 160 | 161 | **Employees Table After Rollback to Savepoint and Commit:** 162 | 163 | | employee_id | department_id | salary | 164 | |-------------|---------------|--------| 165 | | 1 | 1 | 1100 | 166 | | 2 | 1 | 1320 | 167 | | 3 | 2 | 1500 | 168 | 169 | **Interpretation:** 170 | 171 | - Changes made after `dept1_updated` are undone. 172 | - The salary increase for department 2 is rolled back. 173 | - `COMMIT` saves the salary increase for department 1. 174 | 175 | #### Full Transaction Flow 176 | 177 | Here's the entire process in one go: 178 | 179 | ```sql 180 | BEGIN TRANSACTION; 181 | 182 | UPDATE employees 183 | SET salary = salary * 1.10 184 | WHERE department_id = 1; 185 | 186 | SAVEPOINT dept1_updated; 187 | 188 | UPDATE employees 189 | SET salary = salary * 1.05 190 | WHERE department_id = 2; 191 | 192 | -- Decide to undo the last update 193 | ROLLBACK TO dept1_updated; 194 | 195 | COMMIT; 196 | ``` 197 | 198 | ### Transactions in Real Life 199 | 200 | Transactions are essential in scenarios where multiple operations need to be treated atomically. 201 | 202 | #### Banking Example 203 | 204 | Imagine transferring $500 from Account A to Account B. 205 | 206 | ```sql 207 | BEGIN TRANSACTION; 208 | 209 | UPDATE accounts 210 | SET balance = balance - 500 211 | WHERE account_id = 'A'; 212 | 213 | UPDATE accounts 214 | SET balance = balance + 500 215 | WHERE account_id = 'B'; 216 | 217 | COMMIT; 218 | ``` 219 | 220 | If any part of this transaction fails (e.g., insufficient funds in Account A), a `ROLLBACK` ensures neither account balance is changed, maintaining financial integrity. 221 | 222 | ### Rollback Capabilities Across Databases 223 | 224 | Different databases handle transactions in slightly different ways. 
Here's a comparison: 225 | 226 | | Feature | PostgreSQL | MySQL | Oracle | SQL Server | 227 | |--------------------------|------------|---------------|---------------|---------------| 228 | | Transactions | Yes | Yes | Yes | Yes | 229 | | Rollback Support | Yes | Yes | Yes | Yes | 230 | | Savepoints | Yes | Yes | Yes | Yes | 231 | | DML Rollback | Yes | Yes | Yes | Yes | 232 | | DDL Rollback | Limited | Limited | No | Limited | 233 | | Autocommit Default | Off | On | Off | On | 234 | | Isolation Levels | Multiple | Multiple | Multiple | Multiple | 235 | 236 | - Data Manipulation Language (DML) statements such as `INSERT`, `UPDATE`, and `DELETE` can be rolled back in all databases, ensuring changes are not finalized unless explicitly committed. 237 | - Support for rolling back Data Definition Language (DDL) statements like `CREATE`, `ALTER`, and `DROP` varies between databases, as some do not permit rolling back these operations. 238 | - Autocommit behavior in databases such as MySQL and SQL Server automatically commits changes unless a transaction is explicitly initiated, requiring careful handling to avoid unintended permanent changes. 239 | 240 | ### Best Practices for Using Transactions 241 | 242 | - Transactions group related operations to ensure all changes are either committed together or rolled back as a unit. 243 | - Keeping transactions short minimizes resource locks and helps maintain system performance. 244 | - Error handling should include a `ROLLBACK` mechanism to revert changes in case of failures during the transaction. 245 | - Savepoints can be used effectively in complex transactions to allow partial rollbacks, but they may introduce additional overhead. 246 | - Understanding and selecting the appropriate isolation level helps balance the trade-off between performance and data integrity. 247 | -------------------------------------------------------------------------------- /notes/04_acid_properties_and_transactions/01_transactions_intro.md: -------------------------------------------------------------------------------- 1 | ## What Is a Transaction? 2 | 3 | A database transaction is a sequence of operations performed as a single, indivisible unit of work. These operations—such as inserting, updating, or deleting records—are executed together to ensure data integrity and consistency, especially when multiple users or processes access the database at the same time. 4 | 5 | ``` 6 | 1. Initial State: 7 | 8 | ┌───────────────────────┐ ┌───────────────────────┐ 9 | │ Account A │ │ Account B │ 10 | │ │ │ │ 11 | │ Balance: $100 │ │ Balance: $50 │ 12 | └───────────────────────┘ └───────────────────────┘ 13 | 14 | │ │ 15 | │ │ 16 | ▼ ▼ 17 | 18 | 2. Transaction Initiated: 19 | 20 | ┌───────────────────────────────────────────────────────────┐ 21 | │ Transferring $20 from Account A │ 22 | │ to Account B │ 23 | └───────────────────────────────────────────────────────────┘ 24 | 25 | │ │ 26 | ▼ ▼ 27 | 28 | 3. After Transaction: 29 | 30 | ┌───────────────────────┐ ┌───────────────────────┐ 31 | │ Account A │ │ Account B │ 32 | │ │ │ │ 33 | │ Balance: $80 │ │ Balance: $70 │ 34 | └───────────────────────┘ └───────────────────────┘ 35 | ``` 36 | 37 | In the example above, the transaction involves transferring $20 from Account A to Account B. If any part of this transaction fails—say, if the system crashes after debiting Account A but before crediting Account B—the transaction management system ensures that all changes are rolled back, returning the database to its initial state. 
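A minimal SQL sketch of this transfer (assuming an `accounts` table with `account_id` and `balance` columns) shows how the two updates are bound into a single unit of work:

```sql
BEGIN TRANSACTION;

UPDATE accounts SET balance = balance - 20 WHERE account_id = 'A';  -- debit Account A
UPDATE accounts SET balance = balance + 20 WHERE account_id = 'B';  -- credit Account B

COMMIT;  -- both changes become permanent together
-- If either UPDATE fails, a ROLLBACK restores both balances to their initial state.
```

The database never exposes an intermediate state in which Account A has been debited but Account B has not yet been credited.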
38 | 39 | After reading the material, you should be able to answer the following questions: 40 | 41 | 1. What is a database transaction, and why is it important for maintaining data integrity and consistency in a database system? 42 | 2. What are the ACID properties of transactions, and how does each property (Atomicity, Consistency, Isolation, Durability) contribute to reliable transaction processing? 43 | 3. How does the post office analogy illustrate the principles of Atomicity, Consistency, Isolation, and Durability in database transactions? 44 | 4. What are the key components and operations involved in transaction management, including statements like Begin Transaction, Commit, and Rollback? 45 | 5. How do different concurrency control mechanisms, such as optimistic and pessimistic concurrency control, help manage simultaneous transactions and prevent conflicts in a database? 46 | 47 | ### ACID Properties 48 | 49 | Transactions in databases follow the **ACID** properties—Atomicity, Consistency, Isolation, and Durability—to ensure reliability, correctness, and robustness, even during errors or system failures. 50 | 51 | #### Atomicity 52 | 53 | Atomicity guarantees that a transaction is treated as a single, indivisible unit. Either all operations within the transaction succeed, or none do. If any operation within a transaction fails, all previously executed steps are reversed. 54 | 55 | ``` 56 | Transaction Example: 57 | 58 | Initial State: 59 | Account A: $100 60 | Account B: $50 61 | 62 | Transaction Steps: 63 | 1. Debit $20 from Account A (Account A: $80) 64 | 2. Credit $20 to Account B (Account B: $70) 65 | 66 | If Step 2 fails: 67 | Rollback Step 1 → Account A returns to $100 68 | ``` 69 | 70 | *Atomicity prevents partial updates, preserving database consistency.* 71 | 72 | #### Consistency 73 | 74 | Consistency ensures that transactions transition the database from one valid state to another valid state, following all rules, constraints, and triggers defined in the database. 75 | 76 | ``` 77 | Transaction Example: 78 | 79 | Initial State: 80 | Account A: $100 81 | Account B: $50 82 | 83 | After Transaction: 84 | Account A: $80 85 | Account B: $70 86 | 87 | Total balance remains consistent: 88 | Before: $150 | After: $150 89 | ``` 90 | 91 | *Consistency maintains data integrity throughout transactions.* 92 | 93 | #### Isolation 94 | 95 | Isolation ensures concurrent transactions operate independently, without affecting each other. Transactions run as if they are executed sequentially, preventing intermediate states from being visible to other concurrent transactions. 96 | 97 | ``` 98 | Isolation Example: 99 | 100 | Transaction T1: 101 | Reads Account A → Updates Account A 102 | 103 | Transaction T2: 104 | Reads Account B → Updates Account B 105 | 106 | Even if T1 and T2 execute simultaneously: 107 | T1 ↔ Account A (isolated) 108 | T2 ↔ Account B (isolated) 109 | 110 | No interference between transactions. 111 | ``` 112 | 113 | *Isolation prevents transactions from causing conflicts or inconsistency.* 114 | 115 | #### Durability 116 | 117 | Durability guarantees that once a transaction is committed, its effects are permanently saved, even if a system failure occurs immediately afterward. The committed state is stored on durable, non-volatile storage. 118 | 119 | ``` 120 | Durability Example: 121 | 122 | Transaction Commit: 123 | → Changes saved permanently to disk. 
124 | 125 | System Crash Occurs: 126 | → Restart System 127 | 128 | After Recovery: 129 | → Committed changes still present. 130 | ``` 131 | 132 | *Durability ensures permanent recording of committed transactions.* 133 | 134 | ### Analogy of a post office 135 | 136 | Once upon a time in a small village, there was a dedicated postman named Tom. Tom's job was to ensure all letters sent from the village post office reached their intended recipients safely and quickly. 137 | 138 | One day, Tom received a special request. A villager named Alice wanted to send two important letters—one to her friend Bob and another to her cousin Charlie. Tom learned these letters were part of a surprise birthday celebration. Alice made it clear that both letters had to arrive together; otherwise, the surprise would be spoiled. Understanding this, Tom promised Alice that either both letters would be delivered or neither would leave the post office. This clearly demonstrated the principle of ATOMICITY, where tasks must fully complete or not occur at all. 139 | 140 | The village post office had strict rules for handling letters: each must be stamped, sealed, and properly addressed. Tom carefully checked Alice's letters and found one missing a stamp. Knowing the importance of following the rules, he held both letters back until the issue was resolved. This careful approach ensured the postal service's reliability, highlighting CONSISTENCY—making sure every action follows set standards. 141 | 142 | While Tom was working on Alice's letters, the post office was busy with other activities. Villagers like Dave were sending packages and receiving letters at the same time. Tom skillfully managed multiple tasks, ensuring each delivery was handled independently. Even though he was multitasking, Alice's letters and Dave's package were treated separately without interfering with each other. This demonstrated the principle of ISOLATION, where tasks carried out simultaneously do not affect each other negatively. 143 | 144 | Eventually, Alice fixed the stamp issue. Once both letters were ready, Tom sent them out for delivery. Once dispatched, the action became permanent and couldn't be reversed. Tom recorded the details in the official logbook, ensuring clear documentation. Even if issues like bad weather or mechanical problems arose, the post office had ways to ensure Bob and Charlie would eventually receive their letters. This illustrated DURABILITY, meaning that once an action is complete, it stays permanent and secure, just like committed database transactions. 145 | 146 | ### Transaction Management 147 | 148 | Managing transactions involves coordinating their execution to uphold the ACID properties. This ensures that the database remains reliable and consistent, even when multiple transactions occur concurrently. 149 | 150 | - To start a transaction, the *Begin Transaction* statement is executed, marking subsequent operations as part of a single unit of work. 151 | - When finalizing a transaction, the process involves a *Commit* operation that permanently saves every change made. 152 | - In case of an error or explicit cancellation, the *Rollback* mechanism reverts the database to its state before the transaction began. 153 | - Multiple transactions can run simultaneously without conflict through the use of *Concurrency Control* mechanisms, which maintain data consistency and isolation. 
154 | - Varying degrees of isolation are provided by *Isolation Levels*, ranging from Read Uncommitted and Read Committed to Repeatable Read and Serializable. 155 | - Data integrity is safeguarded by the *Atomicity* property, which requires that transactions are executed completely or not at all. 156 | - As a transaction progresses, *Consistency* ensures that the database smoothly transitions between valid states while adhering to all defined rules and constraints. 157 | - Once a transaction is committed, the *Durability* principle guarantees that its changes persist even if a system failure occurs. 158 | - Resource access is regulated by *Locking Mechanisms* such as shared and exclusive locks, preventing conflicts during concurrent transactions. 159 | - Sometimes, *Deadlocks* can emerge when transactions wait indefinitely for each other's resources, which necessitates database intervention to resolve the issue. 160 | - Under the assumption that conflicts are rare, *Optimistic Concurrency Control* checks data integrity at the commit stage rather than locking resources during execution. 161 | - In contrast, *Pessimistic Concurrency Control* proactively employs locks to ensure data isn’t altered by another transaction until the current one is completed. 162 | - Within a transaction, creating a *Savepoint* allows for partial rollbacks, enabling recovery to a particular state without undoing the entire process. 163 | - In distributed environments, the *Two-Phase Commit (2PC)* protocol ensures that every participating node agrees on the transaction's commit, enhancing overall reliability. 164 | - A detailed log, known as a *Transaction Log*, records all changes during a transaction, which supports effective recovery in the event of a failure. 165 | - Common *Read Phenomena* such as dirty reads, non-repeatable reads, and phantom reads are managed by adjusting isolation levels to balance both performance and consistency. 166 | - Lastly, *Database Management Systems (DBMS)* uphold the ACID properties—Atomicity, Consistency, Isolation, and Durability—to ensure that transaction processing remains reliable and robust. 167 | -------------------------------------------------------------------------------- /notes/04_acid_properties_and_transactions/02_atomicity.md: -------------------------------------------------------------------------------- 1 | ## Atomicity in Database Transactions 2 | 3 | Atomicity is a fundamental principle in database systems that ensures each transaction is processed as an indivisible unit. This means that all operations within a transaction must be completed successfully for the transaction to be committed to the database. If any operation fails, the entire transaction is rolled back, leaving the database unchanged. This "all-or-nothing" approach is crucial for maintaining data integrity and consistency. 4 | 5 | Imagine a transaction as a series of steps that are tightly bound together. If one step fails, the entire sequence is aborted to prevent partial updates that could corrupt the database. 6 | 7 | ``` 8 | +---------------------------------+ 9 | | Transaction Steps | 10 | | | 11 | | Step 1: Validate Input | 12 | | Step 2: Update Records | 13 | | Step 3: Write to Log | 14 | | Step 4: Commit Changes | 15 | +---------------------------------+ 16 | ``` 17 | 18 | After reading the material, you should be able to answer the following questions: 19 | 20 | 1. 
What is atomicity in database transactions, and how does it ensure that transactions are processed as indivisible units of work? 21 | 2. Why is atomicity important for preserving data integrity and simplifying error handling in database systems? 22 | 3. How does the Two-Phase Commit Protocol (2PC) facilitate atomicity in distributed database environments? 23 | 4. What are savepoints in transactions, and how do they help manage partial rollbacks while maintaining atomicity? 24 | 5. How can atomicity be implemented and managed in SQL transactions, particularly in scenarios like transferring funds between accounts? 25 | 26 | ### The Importance of Atomicity 27 | 28 | Atomicity plays a vital role in database transactions by ensuring that partial transactions do not leave the database in an inconsistent state. This is especially important in systems where multiple transactions are occurring simultaneously. 29 | 30 | #### Preserving Data Integrity 31 | 32 | By treating transactions as indivisible units, atomicity prevents scenarios where only some parts of a transaction are applied. This means the database remains accurate and reliable, reflecting only complete sets of operations. 33 | 34 | #### Simplifying Error Handling 35 | 36 | Atomicity simplifies the process of dealing with errors during transaction execution. Developers and database administrators can rely on the database system to automatically roll back incomplete transactions, reducing the need for complex error recovery logic. 37 | 38 | ### Real-World Examples 39 | 40 | To better understand atomicity, let's explore some real-world scenarios where this concept is essential. 41 | 42 | #### Bank Account Transfers 43 | 44 | Consider the process of transferring money between two bank accounts. This transaction involves debiting one account and crediting another. Both actions must occur together; otherwise, funds could be lost or erroneously created. 45 | 46 | - A **Complete Transaction** ensures that $500 is properly debited from Account A and credited to Account B, maintaining balance and data integrity. 47 | - In a **Failure Scenario**, if the debit operation succeeds but the credit operation fails, the system could lose $500, creating a discrepancy in the accounts. 48 | 49 | Atomicity ensures that either both accounts are updated or neither is, preserving the integrity of the bank's records. 50 | 51 | #### Online Shopping Orders 52 | 53 | When placing an order online, several operations happen behind the scenes: payment processing, inventory reduction, and order confirmation. If payment processing fails, the system should not reduce inventory or generate an order confirmation. 54 | 55 | - In a **Successful Transaction**, the system processes the payment, updates the inventory to reflect the sold item, and sends a confirmation to the customer, completing the workflow. 56 | - A **Failure Scenario** arises when one operation, such as payment processing, fails while another, like inventory reduction, is executed, leading to inconsistencies such as inaccurate stock levels. 57 | 58 | Atomicity ensures that all steps are completed together, maintaining consistency in the system. 59 | 60 | ### Implementing Atomicity 61 | 62 | To achieve atomicity, database systems employ various techniques and protocols that manage transactions effectively. 
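Before looking at specific protocols, it can help to see the online-order example above written as a single transaction. This is only a sketch; the table names, column names, and values are hypothetical:

```sql
BEGIN;

-- Record the captured payment
INSERT INTO payments (order_id, amount, status)
VALUES (42, 59.99, 'captured');

-- Reduce stock for the purchased item
UPDATE inventory SET quantity = quantity - 1 WHERE product_id = 7;

-- Generate the order confirmation
INSERT INTO order_confirmations (order_id, confirmed_at)
VALUES (42, CURRENT_TIMESTAMP);

COMMIT;  -- if any step fails, a ROLLBACK leaves payment, stock, and confirmation untouched
```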
63 | 64 | #### Two-Phase Commit Protocol (2PC) 65 | 66 | In distributed database systems, the Two-Phase Commit Protocol ensures that all participating databases agree on committing or rolling back a transaction. 67 | 68 | - In the **Prepare Phase**, the transaction coordinator requests all participants to confirm whether they are ready to commit the transaction, ensuring all conditions for a successful commit are met. 69 | - During the **Commit Phase**, the coordinator instructs participants to finalize the transaction if all have agreed to commit; otherwise, a rollback command is issued to undo changes if any participant cannot commit. 70 | 71 | This protocol ensures that either all databases commit the transaction or all roll it back, maintaining atomicity across the system. 72 | 73 | ``` 74 | Coordinator 75 | | 76 | +-- Prepare --> Participant 1 (Ready) 77 | +-- Prepare --> Participant 2 (Ready) 78 | +-- Prepare --> Participant 3 (Ready) 79 | | 80 | +-- Commit --> All Participants 81 | ``` 82 | 83 | #### Savepoints in Transactions 84 | 85 | Savepoints provide a way to partition a transaction into smaller segments. They allow partial rollbacks within a transaction without aborting the entire sequence. 86 | 87 | - Use `SAVEPOINT savepoint_name;` to mark a point within a transaction. 88 | - Use `ROLLBACK TO savepoint_name;` to undo operations back to the savepoint. 89 | - Use `RELEASE SAVEPOINT savepoint_name;` to remove the savepoint. 90 | 91 | Savepoints are useful in complex transactions where certain operations may fail, but earlier successful operations should be retained. 92 | 93 | #### Log-Based Recovery 94 | 95 | Databases use logs to record all changes made during transactions. This approach allows the system to undo or redo transactions in case of failures. 96 | 97 | - In **Write-Ahead Logging**, all changes are first recorded in a log file before being applied to the database, ensuring a reliable mechanism for recovery. 98 | - During the **Recovery Process**, the database utilizes the log file to identify incomplete transactions after a failure and rolls them back to maintain atomicity and consistency. 99 | 100 | This mechanism is essential for maintaining data integrity, especially in systems where transactions are frequently interrupted. 101 | 102 | ### Atomicity in SQL Transactions 103 | 104 | In SQL, transactions are managed using commands that explicitly define the start and end of a transaction. 105 | 106 | #### Basic Transaction Commands 107 | 108 | - `BEGIN TRANSACTION;` marks the start. 109 | - `COMMIT;` saves all changes. 110 | - `ROLLBACK;` undoes all changes since the transaction began. 111 | 112 | #### Example: Transferring Funds Between Accounts 113 | 114 | ```sql 115 | BEGIN TRANSACTION; 116 | 117 | UPDATE accounts 118 | SET balance = balance - 500 119 | WHERE account_id = 1; 120 | 121 | UPDATE accounts 122 | SET balance = balance + 500 123 | WHERE account_id = 2; 124 | 125 | COMMIT; 126 | ``` 127 | 128 | In this example, if either `UPDATE` statement fails, a `ROLLBACK;` command would undo any changes, thanks to the atomicity of the transaction. 
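One way to make such a failure concrete is to let the database itself reject an invalid transfer. The sketch below (the constraint name is illustrative) adds a rule that balances can never go negative, so an overdraft raises an error and the whole transaction can be rolled back:

```sql
ALTER TABLE accounts
ADD CONSTRAINT balance_non_negative CHECK (balance >= 0);

BEGIN TRANSACTION;

UPDATE accounts
SET balance = balance - 500
WHERE account_id = 1;
-- If account 1 holds less than $500, the CHECK constraint rejects the UPDATE here,
-- and the application responds by issuing:
ROLLBACK;
```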
#### Using Savepoints

```sql
BEGIN TRANSACTION;

SAVEPOINT before_update;

UPDATE inventory
SET quantity = quantity - 1
WHERE product_id = 101;

-- If an error is detected at this point, the application can issue
--   ROLLBACK TO before_update;
-- to undo only the work performed after the savepoint, then continue.

COMMIT;
```

By rolling back to the savepoint, the transaction undoes changes made after the savepoint without affecting earlier operations.

#### Visualizing Transaction Flow

To really understand **atomicity** in databases, it's useful to visualize what happens during a transaction. Think of a transaction as a sequence of steps that must either **all succeed or none at all**—no in-between.

Here's a simple diagram showing the *normal flow* of a transaction:

```
[Start Transaction]
        |
  [Operation 1]
        |
  [Operation 2]
        |
[Check for Errors]
        |
   [No Errors]
        |
    [Commit]
```

This is the "happy path." You start a transaction, do your operations, check for any issues, and if nothing's wrong, you commit the changes. Committing makes everything permanent in the database.

But if **any** operation fails, you don't go forward—you go back. That's the whole point of atomicity. You either do it all or undo it all.

Let's see what that looks like:

```
[Start Transaction] <-------+
        |                   |
  [Operation 1]             |
        |                   |
  [Operation 2]             |
        |                   |
 [Error Detected]           |
        |                   |
   [Rollback] --------------+
```

Here's how this might look in actual SQL (using PostgreSQL syntax):

```sql
BEGIN;

UPDATE accounts SET balance = balance - 100 WHERE id = 1;

UPDATE accounts SET balance = balance + 100 WHERE id = 2;

COMMIT;
```

This is transferring $100 from account 1 to account 2. Simple enough.

Now, let's simulate a problem. Say the second `UPDATE` targets an account that doesn't exist. We'd use this approach to protect data integrity:

```sql
BEGIN;

UPDATE accounts SET balance = balance - 100 WHERE id = 1;

-- This statement matches no rows, because account 999 does not exist:
UPDATE accounts SET balance = balance + 100 WHERE id = 999;

ROLLBACK;
```

**What happens?**

- The **first `UPDATE`** goes through and deducts $100.
- The **second `UPDATE`** matches no rows because account 999 doesn't exist. Plain SQL does not treat this as an error; it simply reports "0 rows affected".
- The application (or a constraint or trigger) has to notice that nothing was credited and issue a **ROLLBACK**.
- That means the $100 deduction is also undone, and no money disappears.

**Why is this good?**

Without transactions, you'd have just lost $100 from account 1. Atomicity protects you from half-done operations.

Here's how you might catch this in application code (Python + psycopg2 example) by checking how many rows each statement affected:

```python
import psycopg2

conn = None
try:
    conn = psycopg2.connect(...)
    cur = conn.cursor()

    # psycopg2 opens a transaction automatically on the first execute()
    cur.execute("UPDATE accounts SET balance = balance - 100 WHERE id = 1;")
    cur.execute("UPDATE accounts SET balance = balance + 100 WHERE id = 999;")
    if cur.rowcount == 0:
        raise ValueError("account 999 does not exist")

    conn.commit()

except Exception as e:
    if conn is not None:
        conn.rollback()
    print("Transaction failed and was rolled back:", e)

finally:
    if conn is not None:
        conn.close()
```
Output:

```
Transaction failed and was rolled back: account 999 does not exist
```

So the main idea is: **no partial changes allowed.** Either all steps complete, or the system undoes everything like nothing ever happened.

--------------------------------------------------------------------------------
/notes/04_acid_properties_and_transactions/05_durability.md:
--------------------------------------------------------------------------------

## Durability in Database Transactions

Durability is a fundamental principle in database systems that ensures once a transaction has been committed, its effects are permanent and will survive any subsequent system failures. This means that the data changes made by a transaction are safely stored and can be recovered even if the system crashes or experiences a power loss immediately afterward.

Imagine that every time you save a file on your computer, you expect it to be there the next time you turn it on—even if there was an unexpected shutdown. Similarly, durability guarantees that committed transactions in a database are preserved, providing reliability and trust in the system.

Once a transaction is committed, its changes are permanently recorded, even in the event of a system failure or crash:

```
+--------------------------+
| Transaction Successfully |
|        Committed         |
|   (Changes Finalized)    |
+------------+-------------+
             |
             v
+--------------------------+
|  Write-Ahead Log (WAL)   |
|  (Persistent Log Entry)  |
+------------+-------------+
             |
             v
+--------------------------+
|   Persistent Storage     |
|      (Disk / SSD)        |
|  (Data Remains Intact)   |
+--------------------------+
```

- Once a transaction is successfully committed, its changes are considered final and should be immune to failures.
- Before changes are applied to the primary data storage, they are first recorded in a durable log. This ensures that if a system crash occurs, the database can recover by replaying the WAL.
- The changes are then written to durable storage (e.g., disk or SSD), guaranteeing that the transaction's effects remain, even if power is lost or the system crashes.

After reading the material, you should be able to answer the following questions:

1. What is durability in database transactions, and how does it ensure that committed transactions remain permanent even in the event of system failures?
2. Why is durability important for maintaining data integrity and reliability in database systems?
3. What are the key techniques used to ensure durability, such as Write-Ahead Logging (WAL), checkpointing, and data replication, and how do they work?
4. How does the Two-Phase Commit Protocol (2PC) contribute to durability in distributed database environments?
5. Can you provide real-world examples of scenarios where durability is essential, and explain how durability mechanisms protect data in those cases?
41 | 42 | ### The Importance of Durability 43 | 44 | Durability plays a crucial role in maintaining the integrity and reliability of a database. By ensuring that committed transactions are not lost, it provides confidence that the data remains consistent and accurate over time. 45 | 46 | #### Ensuring Data Persistence 47 | 48 | Once a transaction is committed, durability guarantees that its changes are permanently recorded. This means that even in the face of hardware failures or system crashes, the data modifications are not lost and can be retrieved upon system recovery. 49 | 50 | #### Facilitating System Recovery 51 | 52 | In the event of a system failure, durability allows the database to recover to a consistent state by reapplying or confirming the committed transactions. This ensures that the database does not revert to an earlier state, preventing data loss and maintaining continuity. 53 | 54 | ### Real-World Examples 55 | 56 | To better understand how durability impacts everyday applications, let's explore some scenarios where this property is essential. 57 | 58 | #### Processing Online Orders 59 | 60 | Consider an e-commerce platform where customers place orders and the system updates inventory levels accordingly. 61 | 62 | - A customer completes a purchase, and the system commits the transaction that records the order details and adjusts the stock quantity. 63 | - If a power outage occurs immediately after the transaction commits, the order information and updated inventory levels are preserved. 64 | - When the system restarts, the customer's order is still recorded, and the inventory reflects the correct stock levels, ensuring accurate order fulfillment and inventory management. 65 | 66 | #### Handling Bank Transactions 67 | 68 | Imagine a banking system where funds are transferred between accounts. 69 | 70 | - A transaction debits $1,000 from Account A and credits $1,000 to Account B. 71 | - Once the transaction is committed, both accounts reflect the updated balances. 72 | - If the system crashes right after the commit, upon recovery, the database still shows the debited and credited amounts, preserving the integrity of the financial records. 73 | 74 | ### Techniques for Ensuring Durability 75 | 76 | Databases implement several mechanisms to guarantee that committed transactions remain durable, even in the face of unexpected failures. 77 | 78 | #### Write-Ahead Logging (WAL) 79 | 80 | Write-Ahead Logging is a method where changes are first recorded in a log before being applied to the database itself. 81 | 82 | - Before any modifications are made to the database, the changes are written to a persistent log file. If a system failure occurs, the database can use this log to redo the transactions upon restart. 83 | - This ensures that no committed transactions are lost, as the log provides a reliable record that can be used to restore the database to its correct state. 84 | 85 | #### Checkpointing 86 | 87 | Checkpointing involves periodically saving the current state of the database to stable storage. 88 | 89 | - At certain intervals, the database writes all in-memory changes to disk, creating a consistent snapshot. This reduces recovery time because only transactions after the last checkpoint need to be reapplied. 90 | - By minimizing the amount of data that needs to be recovered, checkpoints help the system return to normal operations more quickly after a failure. 
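PostgreSQL exposes a few client-side commands for observing these mechanisms. The sketch below assumes a PostgreSQL 10 or newer server and a role with sufficient privileges (forcing a checkpoint typically requires superuser rights):

```sql
-- Durability-related settings
SHOW wal_level;            -- how much information is written to the WAL
SHOW synchronous_commit;   -- whether COMMIT waits for the WAL flush to disk

-- Current write-ahead log position
SELECT pg_current_wal_lsn();

-- Force an immediate checkpoint
CHECKPOINT;
```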
#### Data Replication

Replication involves maintaining copies of the database on multiple servers or storage systems.

- Committed transactions are synchronized across different nodes or locations. If one server fails, another can take over, ensuring that the data remains accessible.
- Replication enhances durability by providing redundancy. Even in the event of hardware failure or data corruption on one server, the data remains safe and available on others.

### Visualizing Durability Mechanisms

**Durability** guarantees that once a transaction is committed, its results are permanent—even if the system crashes seconds later. If the database says, "Done," it better mean it.

Let's look at how that works behind the scenes:

```
[Start Transaction]
        |
[Write Changes to Log]
        |
[Apply Changes to Database]
        |
[Commit Transaction]
        |
[Durability Ensured]
```

Each step exists to protect your data from disappearing into the void. Here's how it plays out:

I. **Start Transaction**

At this point, nothing's permanent. You're just signaling that some changes are about to happen.

```sql
BEGIN;
```

II. **Write-Ahead Logging (WAL)**

Before the actual data is changed, all actions are recorded in a transaction log. This is critical. The log is stored on disk immediately.

```plaintext
LOG: UPDATE accounts SET balance = balance - 100 WHERE id = 1
LOG: UPDATE accounts SET balance = balance + 100 WHERE id = 2
```

If the system crashes *after* this point but *before* applying changes to the actual data, the recovery system will use the log to **redo** the transaction.

📝 **Why this matters:** Logging comes *before* any changes are made. That's why it's called *Write-Ahead Logging* (WAL).

III. **Apply Changes to the Database**

Now the actual tables are updated.

```sql
UPDATE accounts SET balance = balance - 100 WHERE id = 1;
UPDATE accounts SET balance = balance + 100 WHERE id = 2;
```

These changes happen in memory first. They'll be flushed to disk shortly, but not necessarily immediately.

IV. **Commit Transaction**

This is the point of no return.

```sql
COMMIT;
```

The system writes a special *commit record* to the log. If that commit log entry exists, then the transaction is considered **durable**.

#### What Happens If There's a Crash?

Imagine the system crashes **right after** the commit. What happens on recovery?

- The system reads the log.
- Sees the commit record.
- Replays all the changes (if necessary) to make sure the database reflects them.

Even if the data changes weren't fully flushed to disk, the **log was**, and that's enough to recover.

#### Analogy: Save Before You Close

Think of this like editing a document:

- You make changes.
- You hit **Ctrl+S** (which writes to the disk).
178 | - Then you close the app. 179 | 180 | Even if your laptop dies after closing, that save ensures your edits aren't lost. That’s durability. 181 | -------------------------------------------------------------------------------- /notes/05_storage_and_indexing/02_row_based_vs_column_based_databases.md: -------------------------------------------------------------------------------- 1 | ## Row-based and Column-based Databases 2 | 3 | Exploring the differences between row-based and column-based databases can help you make informed decisions about data storage and retrieval strategies. This guide delves into the characteristics, use cases, and trade-offs of these two database models, providing clarity on how each can impact performance and efficiency. 4 | 5 | ### Introduction 6 | 7 | Databases organize and store data in various ways to optimize for different types of workloads. The two primary storage models are row-based (row-oriented) and column-based (column-oriented) databases. Understanding these models is crucial for selecting the right database system for your application's needs. 8 | 9 | ### Characteristics of Row-based Databases 10 | 11 | In row-based databases, data is stored one row at a time, with each row containing all the attributes of a single record. This storage model aligns well with transactional systems where operations often involve entire records. 12 | 13 | - **Data Organization**: Rows are stored contiguously, making it efficient to read or write all columns of a record at once. 14 | - **Data Insertion and Updates**: Adding or modifying records is straightforward since the database deals with complete rows. 15 | - **Typical Use Cases**: Ideal for Online Transaction Processing (OLTP) systems like banking applications or e-commerce platforms, where quick, row-level operations are common. 16 | 17 | Here's a simple representation of row-based storage: 18 | 19 | ``` 20 | +---------------------------------------------+ 21 | | Row 1: [ID, Name, Age, Email, Address, ...] | 22 | | Row 2: [ID, Name, Age, Email, Address, ...] | 23 | | Row 3: [ID, Name, Age, Email, Address, ...] | 24 | +---------------------------------------------+ 25 | ``` 26 | 27 | Each row holds all the data for a single record, stored together on disk. 28 | 29 | ### Characteristics of Column-based Databases 30 | 31 | Column-based databases store data one column at a time, with each column containing data for a specific attribute across all records. This model is optimized for analytical queries that process large volumes of data but focus on a few attributes. 32 | 33 | - **Data Organization**: Columns are stored contiguously, allowing efficient access and compression of data. 34 | - **Read Efficiency**: Only the necessary columns are read during a query, reducing I/O operations. 35 | - **Typical Use Cases**: Suited for Online Analytical Processing (OLAP) systems like data warehouses or business intelligence applications, where aggregate functions and column-specific calculations are frequent. 36 | 37 | An illustration of column-based storage: 38 | 39 | ``` 40 | +------------------+------------------+------------------+ 41 | | Column: ID | Column: Name | Column: Age | 42 | | [ID1, ID2, ... ] | [Name1, Name2...]| [Age1, Age2, ...]| 43 | +------------------+------------------+------------------+ 44 | ``` 45 | 46 | Data for each attribute is stored separately, enhancing performance for column-centric operations. 
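To see why the layout matters, here is a small, self-contained Python sketch. It is illustrative only; real engines work at the page and compression level, not with Python lists:

```python
# The same three records laid out both ways.
rows = [
    {"id": 1, "name": "Alice", "age": 28},
    {"id": 2, "name": "Bob",   "age": 35},
    {"id": 3, "name": "Carol", "age": 41},
]

columns = {
    "id":   [1, 2, 3],
    "name": ["Alice", "Bob", "Carol"],
    "age":  [28, 35, 41],
}

# Row layout: fetching one full record touches a single contiguous entry.
full_record = rows[1]

# Column layout: an aggregate over one attribute scans one contiguous array
# and never touches names or ids.
average_age = sum(columns["age"]) / len(columns["age"])

print(full_record)   # {'id': 2, 'name': 'Bob', 'age': 35}
print(average_age)   # 34.666...
```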
47 | 48 | ### Use Cases and Examples 49 | 50 | #### Row-based Databases in Practice 51 | 52 | Consider a customer management system where each customer's complete profile needs to be accessed or updated regularly. A row-based database efficiently handles these operations. 53 | 54 | Example SQL command to retrieve a customer's full profile: 55 | 56 | ```sql 57 | SELECT * FROM customers WHERE customer_id = 12345; 58 | ``` 59 | 60 | This command retrieves all columns for the specified customer, benefiting from the contiguous storage of row-based databases. 61 | 62 | #### Column-based Databases in Practice 63 | 64 | In a scenario where a company wants to analyze sales trends over time, a column-based database can quickly process large datasets by focusing on relevant columns. 65 | 66 | Example SQL query to calculate total sales per month: 67 | 68 | ```sql 69 | SELECT month, SUM(sales_amount) FROM sales_data GROUP BY month; 70 | ``` 71 | 72 | The database reads only the `month` and `sales_amount` columns, making the operation faster and more efficient. 73 | 74 | ### Trade-offs Between the Models 75 | 76 | Each storage model offers advantages and disadvantages, impacting performance and storage requirements. 77 | 78 | #### Storage Efficiency 79 | 80 | - **Row-based Databases**: May use more disk space due to the storage of diverse data types together, which can limit compression effectiveness. 81 | - **Column-based Databases**: Often achieve higher compression ratios since similar data types are stored together, reducing storage costs. 82 | 83 | #### Query Performance 84 | 85 | - **Row-based Databases**: Perform well for queries that need full records but may be less efficient for aggregations on specific columns. 86 | - **Column-based Databases**: Excel at queries involving large datasets and specific columns, like statistical analyses or report generation. 87 | 88 | #### Write and Update Operations 89 | 90 | - **Row-based Databases**: Offer faster writes and updates since entire rows are handled in single operations. 91 | - **Column-based Databases**: Can be slower for writes and updates because data for each attribute is stored separately, potentially requiring multiple write operations. 92 | 93 | ### Practical Examples with Commands and Outputs 94 | 95 | #### Inserting Data in a Row-based Database (MySQL) 96 | 97 | When adding a new user to a row-based database: 98 | 99 | ```sql 100 | INSERT INTO users (user_id, name, email, age) 101 | VALUES (101, 'Alice Johnson', 'alice@example.com', 28); 102 | ``` 103 | 104 | - **Operation**: Inserts a complete record in one go. 105 | - **Efficiency**: Optimized for transactional operations that deal with full records. 106 | 107 | #### Querying Data in a Row-based Database 108 | 109 | Retrieving a user's full profile: 110 | 111 | ```sql 112 | SELECT * FROM users WHERE user_id = 101; 113 | ``` 114 | 115 | Expected output: 116 | 117 | | user_id | name | email | age | 118 | |---------|----------------|-------------------|-----| 119 | | 101 | Alice Johnson | alice@example.com | 28 | 120 | 121 | Interpretation: 122 | 123 | - All user information is fetched efficiently due to contiguous row storage. 124 | - Ideal for applications where full record access is common. 
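If you want to confirm how such a lookup is executed, most row-oriented engines can show the access path. In MySQL, for example, `EXPLAIN` reports the plan; for this primary-key lookup it should show `key = PRIMARY` and an access type of `const`, meaning a single-row lookup rather than a full table scan (exact output columns vary by version):

```sql
EXPLAIN SELECT * FROM users WHERE user_id = 101;
```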
125 | 126 | #### Inserting Data in a Column-based Database (Apache Cassandra) 127 | 128 | Adding a new entry to a column-based database: 129 | 130 | ```sql 131 | INSERT INTO users (user_id, name, email, age) 132 | VALUES (101, 'Alice Johnson', 'alice@example.com', 28); 133 | ``` 134 | 135 | - **Operation**: Data is distributed across column families. 136 | - **Consideration**: May involve multiple write operations internally. 137 | 138 | #### Querying Data in a Column-based Database 139 | 140 | Fetching specific attributes: 141 | 142 | ```sql 143 | SELECT name, email FROM users WHERE user_id = 101; 144 | ``` 145 | 146 | Expected output: 147 | 148 | | name | email | 149 | |----------------|-------------------| 150 | | Alice Johnson | alice@example.com | 151 | 152 | Interpretation: 153 | 154 | - Only the requested columns are read, reducing unnecessary data retrieval. 155 | - Enhances performance for queries that don't require full records. 156 | 157 | ### Using Tables to Explain Command Options 158 | 159 | Understanding command options can be easier when presented in a table format. Here's an example using SQL query clauses: 160 | 161 | | Clause | Purpose | 162 | |-------------|------------------------------------------------| 163 | | `SELECT` | Specifies the columns to retrieve | 164 | | `FROM` | Indicates the table to query | 165 | | `WHERE` | Filters records based on conditions | 166 | | `GROUP BY` | Aggregates data across specified columns | 167 | | `ORDER BY` | Sorts the result set according to given columns| 168 | 169 | This table helps clarify the function of each clause in an SQL statement. 170 | 171 | ### ASCII Diagrams Illustrating Concepts 172 | 173 | #### Row-based Storage Visualization 174 | 175 | When data is stored in rows: 176 | 177 | ``` 178 | +----------------------------+ 179 | | Record 1: [A, B, C, D] | 180 | +----------------------------+ 181 | | Record 2: [E, F, G, H] | 182 | +----------------------------+ 183 | | Record 3: [I, J, K, L] | 184 | +----------------------------+ 185 | ``` 186 | 187 | All attributes of a record are stored together, facilitating quick access to full records. 188 | 189 | #### Column-based Storage Visualization 190 | 191 | When data is stored in columns: 192 | 193 | ``` 194 | +-----------+-----------+-----------+-----------+ 195 | | Column A | Column B | Column C | Column D | 196 | +-----------+-----------+-----------+-----------+ 197 | | A | B | C | D | 198 | | E | F | G | H | 199 | | I | J | K | L | 200 | +-----------+-----------+-----------+-----------+ 201 | ``` 202 | 203 | Data for each attribute is stored separately, enhancing performance for column-specific queries. 204 | 205 | ### Considering Hybrid Approaches 206 | 207 | Some database systems offer hybrid models to leverage the advantages of both storage types. 208 | 209 | - **Example**: Microsoft's SQL Server offers clustered columnstore indexes, allowing for both row-based and column-based storage within the same database. 210 | - **Benefit**: Supports a mix of transactional and analytical workloads by optimizing storage based on usage patterns. 211 | 212 | ### Performance Implications 213 | 214 | #### Data Compression 215 | 216 | Column-based databases can compress data more effectively due to the uniformity of data types within a column, leading to reduced storage costs and improved cache efficiency. 217 | 218 | #### I/O Operations 219 | 220 | - **Row-based Databases**: May perform more I/O operations when queries involve only a few columns but require reading entire rows. 
221 | - **Column-based Databases**: Reduce I/O by reading only the necessary columns, which is beneficial for large-scale data analysis. 222 | 223 | -------------------------------------------------------------------------------- /notes/05_storage_and_indexing/03_primary_key_vs_secondary_key.md: -------------------------------------------------------------------------------- 1 | ## Primary Keys and Secondary Keys 2 | 3 | Grasping the concepts of primary and secondary keys is essential when working with relational databases. These keys play a pivotal role in ensuring data integrity, uniquely identifying records, and establishing relationships among different tables. Let's dive into what they are, how they function, and why they're important. 4 | 5 | ### Understanding Primary Keys 6 | 7 | A primary key in a database table is a column, or a set of columns, that uniquely identifies each row within that table. This means that no two rows can have the same primary key value, ensuring the uniqueness of every record. Additionally, primary keys cannot contain `NULL` values, meaning that every row must have a valid and unique identifier. 8 | 9 | For example, consider a `Users` table where each user has a unique `user_id`: 10 | 11 | | user_id | first_name | last_name | email | phone_number | 12 | |---------|------------|-----------|-------------------------|------------------| 13 | | 1 | Alice | Smith | alice.smith@example.com | (555) 123-4567 | 14 | | 2 | Bob | Johnson | bob.johnson@example.com | (555) 987-6543 | 15 | | 3 | Carol | Williams | carol.w@example.com | (555) 555-5555 | 16 | 17 | In this table, `user_id` serves as the primary key, uniquely identifying each user. 18 | 19 | #### Key Characteristics of Primary Keys 20 | 21 | - **Uniqueness**: Every value in the primary key column must be unique across the table. 22 | - **Non-nullability**: Primary keys cannot have `NULL` values; each record must have a value. 23 | - **Single Primary Key per Table**: A table can have only one primary key, which may consist of multiple columns (known as a composite key). 24 | - **Indexing**: Databases automatically create an index on the primary key to speed up data retrieval. 25 | - **Referential Integrity**: Primary keys can be referenced by foreign keys in other tables, establishing relationships between tables. 26 | 27 | ### Exploring Secondary Keys 28 | 29 | Secondary keys, also known as alternate or unique keys, are columns that also contain unique values but are not designated as the primary key. They provide additional ways to identify records uniquely and can be used to enforce uniqueness constraints on other important columns. 30 | 31 | Continuing with the `Users` table, the `email` and `phone_number` columns can serve as secondary keys since they are unique for each user: 32 | 33 | | user_id | first_name | last_name | email | phone_number | 34 | |---------|------------|-----------|-------------------------|------------------| 35 | | 1 | Alice | Smith | alice.smith@example.com | (555) 123-4567 | 36 | | 2 | Bob | Johnson | bob.johnson@example.com | (555) 987-6543 | 37 | | 3 | Carol | Williams | carol.w@example.com | (555) 555-5555 | 38 | 39 | #### Key Characteristics of Secondary Keys 40 | 41 | - **Uniqueness Constraints**: They ensure that values in the secondary key columns are unique, preventing duplicate entries. 42 | - **Multiple per Table**: A table can have multiple secondary keys. 43 | - **Nullable Values**: Secondary keys can contain `NULL` values unless explicitly defined as `NOT NULL`. 
44 | - **Indexing for Performance**: Secondary keys are often indexed to improve query performance when searching by those columns. 45 | - **Alternate Access Paths**: They provide additional ways to access and reference records. 46 | 47 | ### How Primary and Secondary Keys Work Together 48 | 49 | Primary and secondary keys enhance the functionality and integrity of a database by ensuring unique identification and providing multiple ways to access data. 50 | 51 | #### Example: Orders Table 52 | 53 | Consider an `Orders` table where each order is uniquely identified by an `order_id`, the primary key: 54 | 55 | | order_id | user_id | product_id | order_date | order_status | 56 | |----------|---------|------------|------------|--------------| 57 | | 1 | 1 | 101 | 2023-04-01 | shipped | 58 | | 2 | 3 | 102 | 2023-04-03 | delivered | 59 | | 3 | 2 | 103 | 2023-04-05 | processing | 60 | 61 | Here, `order_id` is the primary key, and `user_id` serves as a foreign key that references the `user_id` in the `Users` table. This relationship links each order to the user who placed it. 62 | 63 | #### Visualizing Relationships 64 | 65 | An ASCII diagram can help illustrate the relationship between the `Users` and `Orders` tables: 66 | 67 | ``` 68 | +-----------+ +------------+ 69 | | Users | | Orders | 70 | |-----------| |------------| 71 | | user_id |<-----------| user_id | 72 | | first_name| | order_id | 73 | | last_name | | product_id | 74 | | ... | | ... | 75 | +-----------+ +------------+ 76 | ``` 77 | 78 | The arrow indicates that `user_id` in the `Orders` table references `user_id` in the `Users` table. 79 | 80 | ### Practical Commands and Outputs 81 | 82 | Understanding how to define and use primary and secondary keys involves working with SQL commands. Let's look at some examples. 83 | 84 | #### Creating a Table with Primary and Secondary Keys 85 | 86 | ```sql 87 | CREATE TABLE Users ( 88 | user_id INT PRIMARY KEY, 89 | first_name VARCHAR(50), 90 | last_name VARCHAR(50), 91 | email VARCHAR(100) UNIQUE NOT NULL, 92 | phone_number VARCHAR(20) UNIQUE 93 | ); 94 | ``` 95 | 96 | **Interpretation**: 97 | 98 | - The `user_id` column is set as the primary key. 99 | - The `email` column is defined as a unique secondary key and cannot be `NULL`. 100 | - The `phone_number` column is also a unique secondary key but can be `NULL`. 101 | 102 | #### Inserting Data and Enforcing Uniqueness 103 | 104 | When inserting data into the `Users` table: 105 | 106 | ```sql 107 | INSERT INTO Users (user_id, first_name, last_name, email, phone_number) 108 | VALUES (4, 'Dave', 'Brown', 'dave.brown@example.com', '(555) 222-3333'); 109 | ``` 110 | 111 | If you try to insert another user with the same `email`: 112 | 113 | ```sql 114 | INSERT INTO Users (user_id, first_name, last_name, email, phone_number) 115 | VALUES (5, 'Eve', 'Davis', 'dave.brown@example.com', '(555) 444-5555'); 116 | ``` 117 | 118 | **Output and Interpretation**: 119 | 120 | - The database will return an error: `ERROR: duplicate key value violates unique constraint "users_email_key"`. 121 | - This occurs because the `email` column must be unique, and using an existing email violates the uniqueness constraint enforced by the secondary key. 
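If a table already exists without these rules, a secondary key can usually be added afterwards. A sketch in standard SQL (constraint and index names are illustrative):

```sql
-- Enforce uniqueness declaratively...
ALTER TABLE Users
ADD CONSTRAINT users_email_unique UNIQUE (email);

-- ...or create a unique index explicitly; most engines use such an index
-- to back the constraint anyway.
CREATE UNIQUE INDEX idx_users_phone_number ON Users (phone_number);
```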
122 | 123 | #### Querying Data Using Secondary Keys 124 | 125 | To find a user by their `email`: 126 | 127 | ```sql 128 | SELECT * FROM Users WHERE email = 'dave.brown@example.com'; 129 | ``` 130 | 131 | **Output**: 132 | 133 | | user_id | first_name | last_name | email | phone_number | 134 | |---------|------------|-----------|------------------------|------------------| 135 | | 4 | Dave | Brown | dave.brown@example.com | (555) 222-3333 | 136 | 137 | **Interpretation**: 138 | 139 | - The query efficiently retrieves the user's information using the `email` secondary key, thanks to the index created on that column. 140 | 141 | ### Importance and Use Cases 142 | 143 | #### Primary Keys in Action 144 | 145 | Primary keys are vital for: 146 | 147 | - **Ensuring Data Integrity**: They prevent duplicate records, maintaining the uniqueness of each row. 148 | - **Establishing Relationships**: Primary keys are used in other tables as foreign keys to create links between data. 149 | - **Optimizing Performance**: Indexes on primary keys speed up query execution and data retrieval. 150 | 151 | #### Leveraging Secondary Keys 152 | 153 | Secondary keys enhance database functionality by: 154 | 155 | - **Enforcing Additional Uniqueness**: They ensure that important columns like `email` or `username` remain unique. 156 | - **Improving Query Performance**: Indexes on secondary keys allow for faster searches on those columns. 157 | - **Providing Flexibility**: They offer alternative ways to access and reference records beyond the primary key. 158 | 159 | ### Real-World Scenario: Products Table 160 | 161 | Consider a `Products` table where each product has a unique `product_id` as the primary key and a unique `sku` (Stock Keeping Unit) as a secondary key: 162 | 163 | | product_id | product_name | category | price | stock | sku | 164 | |------------|---------------|-----------|-------|-------|--------------| 165 | | 101 | Laptop | Computers | 999 | 50 | LAPTOP-12345 | 166 | | 102 | Smart Speaker | Audio | 49 | 200 | SPKR-67890 | 167 | | 103 | Monitor | Computers | 199 | 75 | MONITOR-4321 | 168 | 169 | - **Primary Key**: `product_id` uniquely identifies each product. 170 | - **Secondary Key**: `sku` provides another unique identifier, useful in inventory management and sales. 171 | -------------------------------------------------------------------------------- /notes/05_storage_and_indexing/04_database_pages.md: -------------------------------------------------------------------------------- 1 | ## Understanding Database Pages 2 | 3 | Diving into the fundamentals of database systems reveals that database pages are essential units of storage used to organize and manage data on disk. They play a pivotal role in how efficiently data is stored, retrieved, and maintained within a Database Management System (DBMS). Let's explore what database pages are, how they function, and why they're crucial for database performance. 4 | 5 | ### What Are Database Pages? 6 | 7 | In a DBMS, a database page is a fixed-length block of storage, serving as the basic unit for data transfer between the disk and memory. By using pages, the DBMS can read and write data in chunks, optimizing disk I/O operations and improving overall efficiency. 8 | 9 | Here's a simple illustration of a database page: 10 | 11 | ``` 12 | +-------------------------+ 13 | | Page Header | 14 | +-------------------------+ 15 | | Record 1 | 16 | +-------------------------+ 17 | | Record 2 | 18 | +-------------------------+ 19 | | ... 
| 20 | +-------------------------+ 21 | | Record N | 22 | +-------------------------+ 23 | | Free Space | 24 | +-------------------------+ 25 | ``` 26 | 27 | In this diagram, the page consists of a header containing metadata, followed by multiple records and any remaining free space. 28 | 29 | ### Characteristics of Database Pages 30 | 31 | #### Fixed Size 32 | 33 | Database pages typically have a fixed size, which can range from 2KB to 64KB, depending on the DBMS and its configuration. Common page sizes include 4KB, 8KB, and 16KB. The size of the page influences how data is stored and retrieved: 34 | 35 | - **Smaller Page Sizes**: Can reduce wasted space and are efficient for workloads with small, random I/O operations. 36 | - **Larger Page Sizes**: Can improve read/write performance for sequential data access but may increase memory usage if data is sparsely populated. 37 | 38 | #### Structured Organization 39 | 40 | Within each page, data is organized into slots or sections that hold individual records or parts of records. The structure depends on the storage model used: 41 | 42 | - **Row-Based Storage**: Stores entire rows together, ideal for transactional operations where complete records are frequently accessed. 43 | - **Column-Based Storage**: Stores data by columns, which is efficient for analytical queries that process specific attributes across many records. 44 | - **Hybrid Models**: Combine both approaches to optimize for diverse workloads. 45 | 46 | #### Page Header Metadata 47 | 48 | Every page begins with a header containing metadata that helps the DBMS manage and navigate the storage: 49 | 50 | - **Page Type**: Indicates the kind of data stored (e.g., data page, index page). 51 | - **Record Count**: Number of records or slots used within the page. 52 | - **Pointers**: References to other pages or records, facilitating quick data access and manipulation. 53 | 54 | ### The Role of Database Pages in Storage 55 | 56 | #### Data Allocation 57 | 58 | When new data is inserted into the database, the DBMS allocates space within pages to store this data: 59 | 60 | - If a page has enough free space, the new record is added to it. 61 | - If the page is full, the DBMS allocates a new page and may link it to the existing pages. 62 | 63 | This allocation strategy helps in maintaining data locality and efficient storage utilization. 64 | 65 | #### Indexing Mechanisms 66 | 67 | Indexes are crucial for fast data retrieval, and they rely heavily on pages: 68 | 69 | - **Index Pages**: Store index entries that map key values to the locations of the actual data records. 70 | - **Data Pages**: Contain the actual records referenced by the index entries. 71 | 72 | By organizing indexes and data across pages, the DBMS can quickly navigate from an index to the desired data. 73 | 74 | #### Data Retrieval Process 75 | 76 | When a query is executed, the DBMS determines which pages contain the relevant data: 77 | 78 | 1. **Locating Pages**: Uses indexes or scans to find the pages that need to be read. 79 | 2. **Reading Pages**: Loads the necessary pages from disk into memory. 80 | 3. **Extracting Data**: Retrieves the required records from the pages in memory. 81 | 82 | The efficiency of this process depends on factors like page size, data organization, and indexing. 
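To get a feel for these numbers on a live system, PostgreSQL exposes its page (block) size and per-table page counts. The sketch below assumes a PostgreSQL server and an existing table named `your_table`:

```sql
-- Page (block) size used by this server, typically 8192 bytes
SHOW block_size;

-- Approximate number of pages the table occupies (maintained by VACUUM/ANALYZE)
SELECT relname, relpages
FROM pg_class
WHERE relname = 'your_table';

-- Exact on-disk size of the table's main data files, in bytes
SELECT pg_relation_size('your_table');
```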
83 | 84 | ### Performance Considerations 85 | 86 | #### Impact of Page Size 87 | 88 | Choosing the appropriate page size can significantly affect database performance: 89 | 90 | **Larger Pages**: 91 | 92 | - Reduce the number of I/O operations for large, sequential reads. 93 | - May lead to increased memory consumption and potential waste of space due to partially filled pages. 94 | 95 | **Smaller Pages**: 96 | 97 | - Minimize wasted space and can be more efficient for random access patterns. 98 | - Might require more I/O operations to read the same amount of data. 99 | 100 | Selecting the right page size involves balancing these trade-offs based on the specific workload and access patterns of your application. 101 | 102 | #### Managing Page Splits 103 | 104 | A page split occurs when a page becomes full, and the DBMS needs to split it to accommodate new data: 105 | 106 | **Consequences of Page Splits**: 107 | 108 | - Can lead to fragmentation, where related data is spread across non-contiguous pages. 109 | - May degrade performance due to increased I/O operations and cache misses. 110 | 111 | To mitigate the negative effects of page splits: 112 | 113 | - **Proper Indexing**: Designing efficient indexes can reduce the likelihood of page splits by organizing data more effectively. 114 | - **Fill Factor Adjustment**: Setting an appropriate fill factor reserves space within pages for future growth, delaying the need for splits. 115 | 116 | Understanding how page splits affect data storage can be visualized as: 117 | 118 | **Before Split**: 119 | 120 | ``` 121 | +-------------------------+ 122 | | Page Header | 123 | +-------------------------+ 124 | | Record 1 | 125 | +-------------------------+ 126 | | Record 2 | 127 | +-------------------------+ 128 | | Record 3 | 129 | +-------------------------+ 130 | | Record 4 | 131 | +-------------------------+ 132 | | Free Space | 133 | +-------------------------+ 134 | ``` 135 | 136 | **After Split (Page Full, New Record Inserted)**: 137 | 138 | ``` 139 | Page 1: Page 2: 140 | +-------------------------+ +-------------------------+ 141 | | Page Header | | Page Header | 142 | +-------------------------+ +-------------------------+ 143 | | Record 1 | | Record 4 | 144 | +-------------------------+ +-------------------------+ 145 | | Record 2 | | New Record | 146 | +-------------------------+ +-------------------------+ 147 | | Record 3 | | Free Space | 148 | +-------------------------+ +-------------------------+ 149 | | Free Space | +-------------------------+ 150 | +-------------------------+ 151 | ``` 152 | 153 | The data is split between two pages, which can increase the number of I/O operations needed to retrieve related records. 154 | 155 | 156 | ### Practical Examples and Commands 157 | 158 | #### Viewing Page Information in PostgreSQL 159 | 160 | You can inspect page-level details using PostgreSQL's `pageinspect` extension: 161 | 162 | I. **Enable the Extension**: 163 | 164 | ```sql 165 | CREATE EXTENSION pageinspect; 166 | ``` 167 | 168 | II. **Examine a Specific Page**: 169 | 170 | ```sql 171 | SELECT * FROM heap_page_items(get_raw_page('your_table', 0)); 172 | ``` 173 | 174 | This command retrieves information about the first page (`0`) of `your_table`. 175 | 176 | - **Item Offset**: Position of the record within the page. 177 | - **Item Length**: Size of the record in bytes. 178 | - **Heap Tuple Header**: Metadata about the individual record. 179 | - **Data**: Actual content of the record. 
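#### Adjusting the Fill Factor in PostgreSQL

The fill factor mentioned above can be set explicitly. In PostgreSQL, for example, the `fillfactor` storage parameter leaves free space in each page for future inserts and updates; the table and index names below are illustrative:

```sql
-- Leave roughly 20% of every index page free to absorb future inserts and updates
CREATE INDEX idx_orders_customer ON orders (customer_id) WITH (fillfactor = 80);

-- Heap pages can reserve space too, which helps keep updated row versions on the same page
ALTER TABLE orders SET (fillfactor = 90);
```

New settings apply to pages written after the change; existing pages are only repacked when the table or index is rebuilt.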
180 | 181 | #### Monitoring Page Splits in SQL Server 182 | 183 | In Microsoft SQL Server, you can track page splits using the `sys.dm_db_index_operational_stats` dynamic management view: 184 | 185 | ```sql 186 | SELECT 187 | OBJECT_NAME(object_id) AS TableName, 188 | index_id, 189 | leaf_insert_count, 190 | leaf_delete_count, 191 | leaf_update_count, 192 | leaf_page_split_count 193 | FROM sys.dm_db_index_operational_stats(DB_ID(), NULL, NULL, NULL); 194 | ``` 195 | 196 | **Output Interpretation**: 197 | 198 | - **TableName**: Name of the table being monitored. 199 | - **Index_ID**: Identifier for the index within the table. 200 | - **Leaf Page Split Count**: Number of times a leaf-level page split has occurred. 201 | 202 | Monitoring these metrics helps in diagnosing performance issues related to page splits and guiding optimization efforts. 203 | -------------------------------------------------------------------------------- /notes/06_distributed_databases/07_eventual_consistency.md: -------------------------------------------------------------------------------- 1 | ## Eventual Consistency 2 | 3 | Imagine a distributed system with multiple nodes—servers or databases—that share data. When an update occurs on one node, it doesn't instantly reflect on the others due to factors like network latency or processing delays. However, the system is designed so that all nodes will eventually synchronize their data. 4 | 5 | ``` 6 | Initial State: 7 | +-----------+ +-----------+ +-----------+ 8 | | v1 | | v1 | | v1 | 9 | | Node A | | Node B | | Node C | 10 | +-----------+ +-----------+ +-----------+ 11 | 12 | After Update on Node A: 13 | +-----------+ +-----------+ +-----------+ 14 | | v2 | | v1 | | v1 | 15 | | Node A | | Node B | | Node C | 16 | +-----------+ +-----------+ +-----------+ 17 | 18 | Time T1: 19 | +-----------+ +-----------+ +-----------+ 20 | | v2 | | v1 | | v1 | 21 | | Node A | | Node B | | Node C | 22 | +-----------+ +-----------+ +-----------+ 23 | 24 | Time T2: 25 | +-----------+ +-----------+ +-----------+ 26 | | v2 | | v2 | | v1 | 27 | | Node A | | Node B | | Node C | 28 | +-----------+ +-----------+ +-----------+ 29 | 30 | Time T3: 31 | +-----------+ +-----------+ +-----------+ 32 | | v2 | | v2 | | v2 | 33 | | Node A | | Node B | | Node C | 34 | +-----------+ +-----------+ +-----------+ 35 | ``` 36 | 37 | In this scenario, Node A receives an update changing the data from Version 1 to Version 2. Initially, only Node A has the latest version. Over time, the update propagates to Node B and eventually to Node C. By Time T3, all nodes have synchronized to Data Version 2, achieving eventual consistency. 38 | 39 | After reading the material, you should be able to answer the following questions: 40 | 41 | 1. What is eventual consistency in distributed database systems, and how does it differ from strong consistency models? 42 | 2. How does update propagation work in an eventually consistent system, and what factors influence the time it takes for all nodes to synchronize? 43 | 3. What are the main trade-offs of using eventual consistency, particularly regarding temporary inconsistencies and conflict resolution? 44 | 4. What are some common conflict resolution strategies used in eventually consistent systems, such as Last Write Wins, version vectors, and CRDTs? 45 | 5. In what real-world scenarios is eventual consistency particularly beneficial, and how do applications like social media platforms and content delivery networks leverage this consistency model? 
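To make the propagation timeline above concrete, here is a small, self-contained Python sketch of three replicas receiving an update asynchronously. It is purely illustrative; real systems replicate over a network using mechanisms such as gossip or asynchronous replication rather than an in-memory queue:

```python
from collections import deque

# Each node holds its current version of a single data item.
nodes = {"A": "v1", "B": "v1", "C": "v1"}

# A write lands on node A; replication to the other nodes is queued, not immediate.
nodes["A"] = "v2"
replication_queue = deque(["B", "C"])

def read(node):
    return nodes[node]

print("T1:", {n: read(n) for n in nodes})   # A sees v2; B and C still return stale v1

# Each "tick" delivers one queued replication message.
while replication_queue:
    target = replication_queue.popleft()
    nodes[target] = "v2"
    print("tick:", {n: read(n) for n in nodes})

# After the queue drains, every replica has converged on v2: eventual consistency.
```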
46 | 47 | ### Characteristics of Eventual Consistency 48 | 49 | Eventual consistency is considered a **weak consistency model** compared to strong consistency models like linearizability. It allows systems to remain highly available and responsive by permitting temporary inconsistencies. 50 | 51 | #### High Availability and Performance 52 | 53 | By not requiring immediate synchronization across all nodes, systems can process read and write operations without delay. This approach reduces latency because nodes do not need to wait for confirmation from other nodes before responding to a request. 54 | 55 | #### Scalability 56 | 57 | Eventual consistency supports scalability in distributed systems. Nodes can handle requests and updates locally without constant coordination with other nodes, allowing the system to accommodate a growing number of nodes and handle increased loads efficiently. 58 | 59 | ### How Update Propagation Works 60 | 61 | Updates in an eventually consistent system propagate to other nodes asynchronously. The time it takes for updates to reach all nodes depends on factors like network latency, replication mechanisms, and system load. 62 | 63 | #### Example of Update Propagation: 64 | 65 | 1. A user updates their profile picture on a social media platform. 66 | 2. The update is saved on one server (Node A). 67 | 3. Node A asynchronously replicates the change to other servers (Nodes B and C). 68 | 4. Friends accessing the user's profile through different nodes might see the old picture until the update reaches those nodes. 69 | 5. Eventually, all nodes reflect the new profile picture. 70 | 71 | ### Trade-offs of Eventual Consistency 72 | 73 | While eventual consistency offers advantages in availability and performance, it introduces certain trade-offs that need to be managed carefully. 74 | 75 | #### Temporary Inconsistencies 76 | 77 | During the propagation delay, different nodes may hold different versions of the data. This can lead to clients reading stale or outdated information. Applications need to handle these inconsistencies appropriately, perhaps by: 78 | 79 | - Providing mechanisms for conflict resolution. 80 | - Informing users about the potential for stale data. 81 | - Designing operations that can tolerate inconsistencies. 82 | 83 | #### Conflict Resolution 84 | 85 | When multiple nodes update the same data simultaneously, conflicts can arise. The system must have strategies in place to resolve these conflicts and ensure that all nodes eventually agree on the final state of the data. 86 | 87 | ##### Common Conflict Resolution Strategies: 88 | 89 | - **Last Write Wins** resolves conflicts by using timestamps to ensure the most recent update overwrites previous ones. 90 | - **Version vectors** track data versions across replicas to detect and resolve conflicts intelligently. 91 | - **Merge functions** employ application-specific logic to combine or reconcile conflicting updates. 92 | - **CRDTs (Conflict-Free Replicated Data Types)** leverage specialized data structures to handle concurrent updates without introducing conflicts. 93 | 94 | ### Practical Examples of Eventual Consistency 95 | 96 | Eventual consistency is well-suited for applications where immediate consistency is not critical, but high availability and responsiveness are essential. 97 | 98 | #### Social Media Platforms 99 | 100 | On platforms like Twitter or Facebook, when a user posts a new update, it might not appear instantly on all friends' feeds due to propagation delays. 
However, the post will eventually become visible to everyone. This delay is generally acceptable in exchange for the ability to handle millions of users simultaneously. 101 | 102 | #### Collaborative Editing Tools 103 | 104 | In tools like Google Docs, multiple users can edit the same document concurrently. Changes made by one user might not appear immediately to others, but over time, all edits are synchronized, and the document reflects all contributions. The system ensures eventual consistency while allowing users to work without interruption. 105 | 106 | #### Content Delivery Networks (CDNs) 107 | 108 | CDNs cache content at various nodes around the world to serve users with low latency. When content is updated, the new version needs to propagate to all cache nodes. Until the update reaches a particular node, users served by that node might receive the older version. Over time, as caches refresh, all users receive the updated content. 109 | 110 | ### Benefits of Eventual Consistency 111 | 112 | Embracing eventual consistency allows distributed systems to achieve: 113 | 114 | - **High throughput** is achieved as the system avoids immediate synchronization, allowing for a greater volume of operations. 115 | - **Fault tolerance** enables the system to remain operational even when some nodes are unreachable, with updates propagating once connectivity is restored. 116 | - **User experience** improves through continuous availability, particularly in scenarios where slight delays in achieving data consistency are acceptable. 117 | 118 | ### Considerations When Using Eventual Consistency 119 | 120 | Applications relying on eventual consistency need to account for its characteristics in their design. 121 | 122 | #### Designing for Inconsistency 123 | 124 | Applications should handle cases where data may be outdated. For example: 125 | 126 | - Displaying messages indicating that data is being updated. 127 | - Allowing users to refresh or manually sync data. 128 | - Implementing retries for failed operations. 129 | 130 | #### Understanding Consistency Requirements 131 | 132 | Not all applications can tolerate temporary inconsistencies. Systems handling financial transactions or inventory management may require stronger consistency models to prevent errors. It's important to assess consistency requirements based on the application's domain. 133 | 134 | ### Implementing Eventual Consistency 135 | 136 | Implementing eventual consistency involves designing systems that can handle delayed updates and resolve conflicts effectively. 137 | 138 | #### Update Propagation Mechanisms 139 | 140 | Updates can be propagated using various methods: 141 | 142 | - **Gossip protocols** enable nodes to randomly share information with peers, gradually disseminating updates across the network. 143 | - **Asynchronous replication** allows updates to be sent to other nodes without waiting for acknowledgments, ensuring the originating node can continue handling requests. 144 | - **Publish-subscribe systems** ensure nodes receive updates by subscribing to changes, with updates broadcast to all subscribers. 145 | 146 | #### Handling Conflicts with Version Vectors 147 | 148 | Version vectors help track the history of data updates to resolve conflicts. 149 | 150 | ##### Example of Version Vectors: 151 | 152 | - **Node A** updates the data, creating a version vector `[A:1, B:0, C:0]`, indicating its update. 153 | - **Node B** simultaneously updates the same data, resulting in a version vector `[A:0, B:1, C:0]`. 
154 | - During synchronization, the nodes compare version vectors to identify conflicting updates and merge changes according to predefined rules. 155 | -------------------------------------------------------------------------------- /notes/07_concurrency_control/01_shared_vs_exclusive_locks.md: -------------------------------------------------------------------------------- 1 | ## Shared and Exclusive Locks 2 | 3 | Shared and exclusive locks are used in database systems for managing concurrent access to data. They ensure that transactions occur without conflicting with each other, maintaining the integrity and consistency of the database. 4 | 5 | ``` 6 | [Resource: Data Item X] 7 | | 8 | |-- Transaction A wants to READ --> Acquires SHARED LOCK 9 | |-- Transaction B wants to READ --> Acquires SHARED LOCK 10 | | 11 | [Both can read simultaneously] 12 | 13 | [Resource: Data Item Y] 14 | | 15 | |-- Transaction C wants to WRITE --> Acquires EXCLUSIVE LOCK 16 | | 17 | [No other transaction can read or write until the lock is released] 18 | ``` 19 | 20 | In the diagram above, Transactions A and B both acquire shared locks on Data Item X, allowing them to read the data at the same time without interference. Transaction C, however, obtains an exclusive lock on Data Item Y to perform a write operation, preventing other transactions from accessing it until the operation is complete. 21 | 22 | After reading the material, you should be able to answer the following questions: 23 | 24 | 1. What are shared and exclusive locks in database systems, and how do they differ in terms of access permissions for transactions? 25 | 2. How do shared and exclusive locks interact with each other, and what does the lock compatibility matrix illustrate about their behavior? 26 | 3. Can you provide examples of scenarios where shared locks are appropriate and where exclusive locks are necessary to maintain data integrity? 27 | 4. What best practices can be followed to balance concurrency and data integrity when using shared and exclusive locks in transactions? 28 | 5. How do deadlocks occur in the context of shared and exclusive locks, and what strategies can be implemented to prevent and resolve them? 29 | 30 | ### Understanding Shared Locks 31 | 32 | Shared locks allow multiple transactions to read the same data concurrently. They are vital for operations where data needs to be read without being modified, ensuring that the data remains consistent for all reading transactions. 33 | 34 | Imagine a library database where several users are looking up the same book information. Each user's transaction places a shared lock on the book's data, allowing everyone to read the information simultaneously without any conflicts. 35 | 36 | ### Exploring Exclusive Locks 37 | 38 | Exclusive locks grant a single transaction the sole right to read and modify a piece of data. This lock type is necessary when a transaction needs to ensure that no other transactions can interfere with its operation, such as when updating or deleting data. 39 | 40 | Consider an online banking system where a user is transferring money from one account to another. The transaction places an exclusive lock on both account records to prevent other transactions from reading or modifying the balances until the transfer is complete, ensuring the accuracy of the transaction. 41 | 42 | ### Interaction Between Shared and Exclusive Locks 43 | 44 | Understanding how shared and exclusive locks interact is essential for managing database concurrency effectively. 
45 | 46 | | Lock Held \ Requested | Shared Lock Requested | Exclusive Lock Requested | 47 | | ----------------------- | --------------------- | ------------------------ | 48 | | **Shared Lock Held** | Allowed | Not Allowed | 49 | | **Exclusive Lock Held** | Not Allowed | Not Allowed | 50 | 51 | - When a shared lock is already held on a data item, other transactions can also acquire shared locks on it. 52 | - If a shared lock is held, an exclusive lock request will be blocked until all shared locks are released. 53 | - When an exclusive lock is held, all other lock requests (shared or exclusive) are blocked until the exclusive lock is released. 54 | 55 | ### Practical Examples with Commands 56 | 57 | These examples illustrate row-level locking behavior common to most modern relational databases—**PostgreSQL**, **MySQL/InnoDB**, **MariaDB**, **SQL Server**, and **Oracle**—which support shared (S) and exclusive (X) locks at the row level. They do **not** apply to engines or table types without row-level locking (e.g., MySQL’s **MyISAM**), nor to NoSQL stores that use different concurrency controls. 58 | 59 | #### Shared vs. Exclusive Locks: Applicability 60 | 61 | * **Supported**: PostgreSQL, MySQL/InnoDB, MariaDB, SQL Server, Oracle. 62 | * **Not Supported**: MySQL/MyISAM (table-level only), SQLite (uses database-level or page-level locks), many cloud-managed NoSQL databases. 63 | 64 | Locking behavior may vary slightly by isolation level and vendor syntax; the following examples assume the default **READ COMMITTED** isolation level. 65 | 66 | #### Example: Reading Data (Shared Lock) 67 | 68 | In databases with row-level locking, a **shared lock** (S) permits multiple transactions to read the same rows concurrently but prevents any transaction from modifying them until all shared locks are released. 69 | 70 | ```sql 71 | -- Applies in PostgreSQL, MySQL/InnoDB, SQL Server, Oracle 72 | BEGIN TRANSACTION; 73 | SELECT * FROM Employees WHERE Department = 'Sales'; 74 | -- Shared (S) lock on matching rows until COMMIT 75 | COMMIT; 76 | ``` 77 | 78 | #### Example: Updating Data (Exclusive Lock) 79 | 80 | An **exclusive lock** (X) is required for row modifications. If another transaction holds a shared or exclusive lock on the same row, the update waits (or may deadlock under certain patterns). 81 | 82 | ```sql 83 | -- Applies in PostgreSQL, MySQL/InnoDB, SQL Server, Oracle 84 | BEGIN TRANSACTION; 85 | UPDATE Employees SET Salary = Salary * 1.05 WHERE Department = 'Sales'; 86 | -- Request X lock: waits until no other S or X locks exist on those rows 87 | COMMIT; 88 | ``` 89 | 90 | #### Lock Interaction Timeline 91 | 92 | | Step | Transaction | Action | Lock Held | Outcome | 93 | | ---- | ------------ | --------------------------------- | ------------------------ | ------------------------------------------------ | 94 | | 1 | T1 (Reader) | `SELECT ... FOR SHARE` (implicit) | S on Sales rows | Allows other S locks; blocks X locks | 95 | | 2 | T2 (Updater) | `UPDATE ...` | Requests X on Sales rows | Waits until T1 commits and releases its S lock | 96 | | 3 | T1 | `COMMIT` | Releases S | T2 acquires X lock, performs update, then COMMIT | 97 | 98 | > **Note**: Some databases (e.g., Oracle) require explicit `SELECT ... FOR UPDATE` to acquire row locks for reads; others implicitly lock on `UPDATE`. 99 | 100 | #### Considerations and Variations 101 | 102 | * Under **SERIALIZABLE**, readers may acquire additional locks or trigger predicate locks. 
Under **READ UNCOMMITTED**, shared locks may be skipped (dirty reads). 103 | * MyISAM uses table-level locks, so the above does not apply. SQLite uses page or database locks. 104 | * If two transactions request locks in opposite order, a deadlock may occur. Most RDBMS detect and kill the victim. 105 | 106 | ### Balancing Concurrency and Integrity 107 | 108 | Efficient database systems strive to balance the need for high concurrency with the necessity of maintaining data integrity. Locks play a pivotal role in achieving this balance. Here are the key concepts: 109 | 110 | - Shared locks enable multiple transactions to **read the same data simultaneously**, enhancing concurrency and system throughput. 111 | - Exclusive locks restrict access to a resource for modifications, ensuring **data integrity** by preventing conflicts and data corruption during concurrent updates. 112 | - Locking mechanisms must be carefully managed to avoid **deadlocks**, where two or more transactions wait indefinitely for each other to release locks. 113 | - Transaction isolation levels, such as **serializable** and **read committed**, provide a framework for managing concurrency while maintaining data consistency. 114 | 115 | ### Best Practices for Using Locks 116 | 117 | To optimize database performance while ensuring data integrity, the following practices are recommended: 118 | 119 | - Transactions should be designed to **minimize the duration of locks** by keeping operations concise, reducing contention and blocking of other processes. 120 | - Lock granularity should be chosen carefully, with **row-level locks** preferred over table-level locks for fine-grained control, promoting greater concurrency. 121 | - Avoiding unnecessary locks helps reduce overhead; for instance, adopting a **read-uncommitted isolation level** can be beneficial in scenarios where occasional dirty reads are acceptable. 122 | - Deadlock detection and resolution mechanisms should be implemented to **automatically identify and address circular locking scenarios**, ensuring system stability. 123 | - Prioritize using **optimistic concurrency control** techniques, such as timestamp-based validation, in read-heavy systems to reduce locking frequency. 124 | - Regularly monitor and analyze transaction logs to **identify bottlenecks and locking conflicts**, enabling proactive adjustments to database configuration or schema. 125 | - Employ **indexing strategies** to limit the range of locks required, as properly indexed queries reduce the amount of data scanned and locked. 126 | 127 | ### Deadlocks and How to Handle Them 128 | 129 | Deadlocks occur when two or more transactions are waiting indefinitely for each other to release locks. 130 | 131 | ``` 132 | Deadlock Scenario: 133 | 134 | Transaction 1: 135 | LOCK Resource A 136 | WAIT for Resource B 137 | 138 | Transaction 2: 139 | LOCK Resource B 140 | WAIT for Resource A 141 | ``` 142 | 143 | In this situation, Transaction 1 holds a lock on Resource A and waits for Resource B, while Transaction 2 holds a lock on Resource B and waits for Resource A. Neither can proceed, resulting in a deadlock. 144 | 145 | **Strategies to Prevent Deadlocks:** 146 | 147 | - Establishing **resource ordering** ensures that locks are acquired in a consistent sequence, which prevents circular wait conditions from arising. 148 | - Setting a **lock timeout** allows transactions to fail gracefully by limiting the maximum time a lock request can wait, avoiding indefinite blocking. 
149 | - Implementing **deadlock detection** systems enables the identification of deadlock situations, allowing resolution by aborting one of the conflicting transactions. 150 | - Using a **wait-die or wound-wait algorithm** enforces a structured priority-based approach to manage transactions and prevent deadlocks. 151 | - Designing transactions to **lock resources in bulk** at the beginning reduces the chances of mid-transaction lock conflicts, which can trigger deadlocks. 152 | - Minimizing **long-running transactions** reduces the risk of lock contention, as shorter transactions are less likely to encounter deadlock situations. 153 | - Optimizing **index usage** and query design decreases the number of locks required, reducing the probability of lock-related conflicts. 154 | - Regularly reviewing and analyzing **deadlock logs** aids in understanding the root causes and refining locking strategies accordingly. 155 | -------------------------------------------------------------------------------- /notes/08_database_performance/02_indexing_strategies.md: -------------------------------------------------------------------------------- 1 | ## Indexing Strategies 2 | 3 | Indexes play a crucial role in enhancing database query performance by allowing quick data retrieval without scanning every row in a table. Different indexing strategies are suited for various use cases and data types. Let's explore four common indexing strategies: **B-tree**, **Bitmap**, **Hash**, and **Full-Text** indexes. We'll delve into how they work, when to use them, and provide examples to illustrate their implementation. 4 | 5 | ### B-tree Indexing Strategy 6 | 7 | B-tree (Balanced Tree) indexes are one of the most widely used indexing methods in databases. They maintain sorted data in a tree-like structure, which allows for efficient insertion, deletion, and lookup operations. 8 | 9 | #### Understanding B-tree Indexes 10 | 11 | Imagine a library where books are organized alphabetically by title. Finding a book doesn't require scanning every single one; instead, you can quickly navigate to the section with the first letter, then to the specific book. B-tree indexes work similarly by keeping data sorted and balanced, ensuring that operations can be performed in logarithmic time. 12 | 13 | Here's a simplified ASCII representation of a B-tree: 14 | 15 | ``` 16 | Tree: 17 | [M] 18 | / \ 19 | [G] [T] 20 | / \ / \ 21 | [A-F] [H-L] [N-S] [U-Z] 22 | ``` 23 | 24 | In this diagram: 25 | 26 | - Each node represents a range of values. 27 | - The tree is balanced, meaning all leaf nodes are at the same level. 28 | - Searching for a value involves traversing from the root to a leaf node, making decisions at each node. 29 | 30 | #### When to Use B-tree Indexes 31 | 32 | B-tree indexes are ideal for columns where you frequently perform range queries, sorting, or need fast access to individual records based on a key. They are the default index type in many databases because of their versatility. 33 | 34 | #### Example in PostgreSQL 35 | 36 | Suppose you have a `customers` table with a `last_name` column, and you often search for customers by their last name or need to list them in alphabetical order. 
37 | 38 | Creating a B-tree index on the `last_name` column: 39 | 40 | ```sql 41 | CREATE INDEX idx_customers_last_name ON customers(last_name); 42 | ``` 43 | 44 | By doing this, queries like the following will execute more efficiently: 45 | 46 | ```sql 47 | SELECT * FROM customers WHERE last_name = 'Smith'; 48 | ``` 49 | 50 | #### How B-tree Indexes Improve Performance 51 | 52 | Without an index, the database would perform a full table scan to find all customers with the last name 'Smith'. With the B-tree index, it can quickly locate the matching entries without scanning unnecessary rows. 53 | 54 | ### Bitmap Indexing Strategy 55 | 56 | Bitmap indexes are designed for columns with a limited number of distinct values, known as low cardinality. They use bit arrays (bitmaps) to represent the presence or absence of a value, allowing for fast logical operations. 57 | 58 | #### Understanding Bitmap Indexes 59 | 60 | Consider a table that records survey responses, with a `gender` column that can be 'Male', 'Female', or 'Other'. A bitmap index creates a separate bitmap for each distinct value: 61 | 62 | ``` 63 | Gender Column: 64 | Row IDs: 1 2 3 4 5 6 7 8 9 65 | 66 | 'Male' Bitmap: 1 0 1 0 1 0 1 0 1 67 | 'Female' Bitmap: 0 1 0 1 0 1 0 1 0 68 | 'Other' Bitmap: 0 0 0 0 0 0 0 0 0 69 | ``` 70 | 71 | - Each bitmap is a sequence of bits where each bit represents a row. 72 | - A '1' indicates the presence of the value in that row; a '0' indicates absence. 73 | - Logical operations (AND, OR) can be performed quickly across bitmaps. 74 | 75 | #### When to Use Bitmap Indexes 76 | 77 | Bitmap indexes are particularly effective in data warehousing environments where queries often involve multiple conditions on low-cardinality columns. They excel at combining conditions using logical operations. 78 | 79 | #### Example in Oracle Database 80 | 81 | Suppose you have a `sales` table with a `region` column that has a small set of possible values ('North', 'South', 'East', 'West'). 82 | 83 | Creating a bitmap index: 84 | 85 | ```sql 86 | CREATE BITMAP INDEX idx_sales_region ON sales(region); 87 | ``` 88 | 89 | This index speeds up queries like: 90 | 91 | ```sql 92 | SELECT COUNT(*) FROM sales WHERE region = 'North' AND product_category = 'Electronics'; 93 | ``` 94 | 95 | #### Advantages and Considerations 96 | 97 | - **Advantages**: Efficient for complex queries involving multiple low-cardinality columns; reduced storage space due to bitmap compression. 98 | - **Considerations**: Not suitable for columns with high cardinality; bitmap indexes can cause contention issues in environments with frequent updates. 99 | 100 | ### Hash Indexing Strategy 101 | 102 | Hash indexes use a hash function to map values to a location in a hash table, enabling fast retrieval for equality comparisons. 103 | 104 | #### Understanding Hash Indexes 105 | 106 | Imagine a library where each book is assigned a unique code generated by a hash function based on the book's title. When you want to find a book, you input the title into the hash function and go directly to the location where it's stored. 107 | 108 | Here's a conceptual diagram: 109 | 110 | ``` 111 | Hash Function: H(value) -> Location 112 | 113 | Values: 114 | - 'Apple' -> H('Apple') -> Location 5 115 | - 'Banana' -> H('Banana') -> Location 2 116 | - 'Cherry' -> H('Cherry') -> Location 8 117 | ``` 118 | 119 | - The hash function computes a location for each value. 120 | - Lookup is performed by computing the hash of the search value and accessing the corresponding location. 
121 | 122 | #### When to Use Hash Indexes 123 | 124 | Hash indexes are suitable for columns where you frequently perform equality searches, such as primary keys or unique identifiers. They are not suitable for range queries or sorting because the hash function does not preserve order. 125 | 126 | #### Example in PostgreSQL 127 | 128 | Suppose you have a `users` table with a `username` column that you want to index for fast lookup. 129 | 130 | Creating a hash index: 131 | 132 | ```sql 133 | CREATE INDEX idx_users_username ON users USING HASH (username); 134 | ``` 135 | 136 | Querying with the index: 137 | 138 | ```sql 139 | SELECT * FROM users WHERE username = 'johndoe'; 140 | ``` 141 | 142 | #### Limitations 143 | 144 | - **No Range Queries**: Cannot efficiently handle queries like `username > 'j'`. 145 | - **Collision Handling**: Hash functions can produce the same hash for different inputs, requiring collision resolution. 146 | 147 | ### Full-Text Indexing Strategy 148 | 149 | Full-text indexes are designed to handle complex searches within large text fields, such as searching for specific words or phrases in documents. 150 | 151 | #### Understanding Full-Text Indexes 152 | 153 | Consider a search engine that indexes the content of web pages to allow users to search for specific terms. Full-text indexing involves creating an inverted index that maps words to the documents they appear in. 154 | 155 | Simplified diagram: 156 | 157 | ``` 158 | Word-to-Document Mapping: 159 | 160 | 'Database' -> Doc1, Doc3 161 | 'Index' -> Doc2, Doc3 162 | 'Query' -> Doc1, Doc2 163 | ``` 164 | 165 | - Each word is linked to the documents containing it. 166 | - Searching for documents containing 'Database' and 'Index' involves finding the intersection of their lists. 167 | 168 | #### When to Use Full-Text Indexes 169 | 170 | Full-text indexes are ideal for columns containing large amounts of text where you need to perform searches based on words or phrases, such as product descriptions, articles, or comments. 171 | 172 | #### Example in PostgreSQL 173 | 174 | Suppose you have an `articles` table with a `content` column containing the text of each article. 175 | 176 | Creating a full-text index: 177 | 178 | ```sql 179 | CREATE INDEX idx_articles_content ON articles USING GIN (to_tsvector('english', content)); 180 | ``` 181 | 182 | Searching using the index: 183 | 184 | ```sql 185 | SELECT title FROM articles WHERE to_tsvector('english', content) @@ to_tsquery('database & indexing'); 186 | ``` 187 | 188 | This query finds articles that contain both 'database' and 'indexing'. 189 | 190 | #### Features and Benefits 191 | 192 | - **Advanced Search Capabilities**: Supports stemming, ranking, and phrase searching. 193 | - **Performance**: Significantly improves the speed of text searches compared to scanning entire text fields. 194 | 195 | ### Best Practices for Indexing 196 | 197 | Effective indexing improves query performance while managing resource costs. 198 | 199 | #### Analyze Query Patterns 200 | 201 | - Applications performing equality searches benefit from hash indexes, which provide fast lookups for exact matches. 202 | - Queries involving range conditions or sorting operations are well-suited to B-tree indexes, which support ordered data access. 203 | - Columns with low cardinality may perform better with bitmap indexes, especially in cases involving multiple conditions. 204 | - Text fields requiring search functionality can use full-text indexes for optimized searching. 
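Whichever index type you choose from the patterns above, it is worth confirming that the optimizer actually uses it. In PostgreSQL, for instance, `EXPLAIN ANALYZE` reveals the chosen plan; a rough check against the earlier `customers` example might look like this:

```sql
EXPLAIN ANALYZE
SELECT * FROM customers
WHERE last_name = 'Smith';
-- Expect a line such as "Index Scan using idx_customers_last_name on customers".
-- A "Seq Scan on customers" instead suggests the index is missing, unusable, or not selective enough.
```

Other engines offer equivalents (`EXPLAIN` in MySQL, execution plans in SQL Server), so the same verification step applies regardless of platform.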
205 | 206 | #### Monitor and Maintain Indexes 207 | 208 | - Fragmentation over time reduces performance, making it useful to rebuild indexes periodically. 209 | - Keeping database statistics updated is important for the query optimizer to make informed decisions. 210 | - Excessive indexing should be avoided, as it can degrade performance of write-intensive operations such as INSERT, UPDATE, and DELETE. 211 | 212 | #### Balance Performance and Resources 213 | 214 | - Indexing should focus on columns that are frequently queried to avoid unnecessary overhead. 215 | - Queries filtering on multiple columns simultaneously can benefit from composite indexes. 216 | - Storage costs should be evaluated, particularly for large datasets, to balance performance benefits against space requirements. 217 | 218 | #### Testing and Iteration 219 | 220 | - Developing indexes in a controlled environment allows for testing their impact on query performance without affecting production systems. 221 | - Query execution can be analyzed using EXPLAIN plans to ensure indexes are being utilized effectively. 222 | - Continuous monitoring after deployment helps identify any unexpected impacts, enabling further adjustments as needed. 223 | -------------------------------------------------------------------------------- /notes/09_database_replication/04_synchronous_vs_asynchronous_replication.md: -------------------------------------------------------------------------------- 1 | ## Synchronous and Asynchronous Replication 2 | 3 | Replication is an important concept in database systems, involving the copying of data from one database server, known as the primary, to one or more other servers called replicas. This process enhances data availability, fault tolerance, and load balancing across the system. Understanding the two main replication strategies—synchronous and asynchronous replication—is crucial for designing robust and efficient database architectures. 4 | 5 | ### Replication Strategies 6 | 7 | At its core, replication ensures that data is consistently available across multiple servers. The key difference between synchronous and asynchronous replication lies in how and when data changes are propagated from the primary server to the replicas. 8 | 9 | #### Synchronous Replication 10 | 11 | In synchronous replication, every write operation on the primary database is immediately propagated to the replicas. The primary server waits for acknowledgments from all replicas before confirming the transaction to the client. This means that data is consistent across all servers at any given moment. 12 | 13 | **How it works:** 14 | 15 | 1. A client sends a write request to the primary server. 16 | 2. The primary server writes the data and sends the changes to all replicas. 17 | 3. Each replica writes the data and sends an acknowledgment back to the primary server. 18 | 4. Once all acknowledgments are received, the primary server confirms the transaction to the client. 
19 | 20 | ``` 21 | Client 22 | | 23 | | (1) Write Request 24 | v 25 | +--------------------+ 26 | | Primary Server | 27 | +---------+----------+ 28 | | 29 | | (2) Send Data to Replicas 30 | v 31 | +---------+----------+ +---------+----------+ 32 | | Replica 1 | | Replica 2 | 33 | +---------+----------+ +---------+----------+ 34 | | (3) Ack | (3) Ack 35 | +-----------+----------------+ 36 | | 37 | (4) Confirm Transaction 38 | | 39 | v 40 | Transaction Confirmed to Client 41 | 42 | ``` 43 | 44 | | **Advantages** | **Disadvantages** | 45 | | ------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------ | 46 | | Ensures strong data consistency across all servers | Increases latency because the primary server waits for acknowledgments from replicas | 47 | | Minimizes the risk of data loss since data is committed on all servers before confirmation | May impact performance, especially in environments with high network latency | 48 | | Simplifies failover processes because replicas are always up-to-date | Scalability can be limited due to the overhead of maintaining synchronization | 49 | 50 | #### Asynchronous Replication 51 | 52 | Asynchronous replication allows the primary server to confirm transactions without waiting for replicas to acknowledge the data writes. Data changes are sent to replicas after the transaction has been committed on the primary server, which means there may be a delay before replicas are updated. 53 | 54 | **How it works:** 55 | 56 | 1. A client sends a write request to the primary server. 57 | 2. The primary server writes the data and immediately confirms the transaction to the client. 58 | 3. The primary server queues the data changes for replication. 59 | 4. Replicas receive the data changes asynchronously and update their data. 60 | 61 | ``` 62 | Client 63 | | 64 | | (1) Write Request 65 | v 66 | +--------------------+ 67 | | Primary Server | 68 | +---------+----------+ 69 | | (2) Immediate ACK to Client 70 | | 71 | | (3) Send Data to Replicas 72 | v 73 | +---------+----------+ +---------+----------+ 74 | | Replica 1 | | Replica 2 | 75 | +---------+----------+ +---------+----------+ 76 | | (4) ACK | (4) ACK 77 | +-----------+----------------+ 78 | | 79 | [Replication Complete] 80 | ``` 81 | 82 | | **Advantages** | **Disadvantages** | 83 | | ---------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------- | 84 | | Reduces latency since the primary server doesn't wait for replicas | Potential for data inconsistency between the primary and replicas | 85 | | Improves performance and throughput on the primary server | Risk of data loss if the primary server fails before replication occurs | 86 | | More scalable in environments with high network latency or geographically distributed replicas | More complex failover procedures may be required to ensure data integrity | 87 | 88 | ### Choosing Between Synchronous and Asynchronous Replication 89 | 90 | Selecting the appropriate replication strategy depends on the specific needs of your application and infrastructure. 
91 | 92 | | **Category** | **Synchronous Replication** | **Asynchronous Replication** | 93 | | ---------------------------- | ------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------- | 94 | | **Data Consistency** | Strong—writes are confirmed only once all replicas have committed, ensuring identical data across nodes. | Eventual—primary confirms writes immediately; replicas catch up afterward, so briefly divergent states are possible. | 95 | | **Latency Impact** | Higher—each transaction waits for replica acknowledgments, adding round-trip delays. | Lower—primary does not wait, so transactions complete as soon as local commit is done. | 96 | | **Throughput & Performance** | Moderate—overall throughput can suffer under high-load or high-latency conditions due to synchronization overhead. | High—primary server can handle more transactions per second without waiting on replicas. | 97 | | **Scalability** | Limited—scaling to many or geographically distant replicas exacerbates latency and coordination costs. | Excellent—replicas can be added anywhere without significantly affecting primary performance. | 98 | | **Failover Complexity** | Simple—since replicas are up-to-date, promoting one to primary is straightforward. | Complex—need to detect and reconcile any unreplicated transactions; risk of data loss on failover. | 99 | | **Risk of Data Loss** | Minimal—as long as a majority (or all, depending on quorum) of replicas acknowledge, data is safe. | Present—writes acknowledged by primary may not yet exist on replicas if a sudden failure occurs. | 100 | | **Typical Use Cases** | ● Financial transaction systems
● Order-entry platforms
● Catalog updates requiring atomicity | ● Global content distribution
● Analytics or logging pipelines
● High-performance web applications where slight lag is acceptable | 101 | | **Best Network Conditions** | Low-latency, high-bandwidth links (e.g., within the same data center or region). | Variable or high-latency networks (e.g., cross-continent, multi-cloud or edge deployments). | 102 | 103 | ### Best Practices 104 | 105 | Implementing replication effectively requires careful planning and consideration of several factors. 106 | 107 | **Application Requirements:** 108 | 109 | - Assess the criticality of data consistency versus performance needs. 110 | - Determine acceptable levels of latency and potential data loss. 111 | - Plan for failure scenarios and how the system should respond. 112 | 113 | **Monitoring and Maintenance:** 114 | 115 | - Regularly monitor replication status and lag times. 116 | - Set up alerting mechanisms for replication failures or significant delays. 117 | - Perform routine testing of failover procedures. 118 | 119 | **Optimizing Network Infrastructure:** 120 | 121 | - Ensure reliable, high-speed network connections between servers. 122 | - Use network optimization techniques to reduce latency. 123 | - Consider network security measures to protect data during replication. 124 | 125 | **Data Safety Measures:** 126 | 127 | - Maintain regular backups, even when using replication. 128 | - Implement transaction logging to assist with recovery if needed. 129 | - Periodically validate data consistency between the primary and replicas. 130 | -------------------------------------------------------------------------------- /notes/10_nosql_databases/01_nosql_databases_intro.md: -------------------------------------------------------------------------------- 1 | ## NoSQL Databases 2 | 3 | NoSQL (Not Only SQL) databases are non-relational data storage systems that offer flexible schemas and scalable performance for handling large volumes of unstructured or semi-structured data. Unlike traditional relational databases that use tables and fixed schemas, NoSQL databases accommodate a wide variety of data models, making them suitable for modern applications that require rapid development, horizontal scalability, and real-time processing. 4 | 5 | After reading the material, you should be able to answer the following questions: 6 | 7 | 1. What are the main types of NoSQL databases, and what are their primary use cases? 8 | 2. How do NoSQL databases achieve scalability and flexibility compared to traditional relational databases? 9 | 3. What are the advantages of using document stores and key-value stores in NoSQL databases? 10 | 4. What are some of the common disadvantages associated with NoSQL databases, particularly regarding ACID compliance and data consistency? 11 | 5. How do graph databases differ from other NoSQL database types, and in what scenarios are they most effectively utilized? 12 | 13 | ### Types of NoSQL Databases 14 | 15 | NoSQL databases are classified based on their data models, each optimized for specific use cases and offering unique advantages. 16 | 17 | #### 1. Document Stores 18 | 19 | - Document stores manage data in documents using formats like JSON, BSON, or XML. 20 | - Each document contains semi-structured data that can vary in structure, providing flexibility and ease of evolution. 21 | - Common use cases include content management systems, blogging platforms, user profiles, and e-commerce product catalogs. 22 | - Examples of document stores are MongoDB and CouchDB. 
23 | 24 | **Example Document:** 25 | 26 | ```json 27 | { 28 | "_id": "user123", 29 | "name": "John Doe", 30 | "email": "john@example.com", 31 | "preferences": { 32 | "language": "en", 33 | "timezone": "UTC" 34 | } 35 | } 36 | ``` 37 | 38 | #### 2. Key-Value Stores 39 | 40 | - Key-value stores are the simplest type of NoSQL databases, storing data as a collection of key-value pairs. 41 | - The key serves as a unique identifier, and the value is the data associated with the key, which can be a simple string or a complex object. 42 | - Use cases include caching, session management, user preferences, shopping carts, and real-time analytics. 43 | - Examples of key-value stores are Redis, Riak, and Amazon DynamoDB. 44 | 45 | **Example Key-Value Pair:** 46 | 47 | ```plaintext 48 | Key: "session_12345" 49 | Value: 50 | { 51 | "user_id": "user1", 52 | "cart": ["item1", "item2"], 53 | "expires": "2024-09-14T12:00:00Z" 54 | } 55 | ``` 56 | 57 | #### 3. Column Stores (Wide Column Stores) 58 | 59 | - Column stores organize data into rows and columns but allow for variable numbers of columns per row. 60 | - They use column families to group related data, making them efficient for querying large datasets where certain columns are accessed frequently. 61 | - Use cases include event logging, time-series data, IoT data storage, and analytical applications. 62 | - Examples of column stores are Apache Cassandra and Apache HBase. 63 | 64 | **Data Representation:** 65 | 66 | | Row Key | Name | Age | Location | Last Login | 67 | |----------|-------|-----|----------|--------------| 68 | | user123 | Alice | 30 | NYC | 2024-09-14 | 69 | | user456 | Bob | 25 | LA | 2024-09-13 | 70 | 71 | #### 4. Graph Databases 72 | 73 | - Graph databases represent data as nodes (entities) and edges (relationships), allowing for complex relationships and interconnections to be efficiently stored and queried. 74 | - Use cases include social networks, recommendation engines, fraud detection, and knowledge graphs. 75 | - Examples of graph databases are Neo4j and Amazon Neptune. 76 | 77 | **Example Relationships:** 78 | 79 | - [Alice] follows [Bob]. 80 | - [Alice] likes [Post: "Understanding NoSQL"]. 81 | 82 | ### Characteristics of NoSQL Databases 83 | 84 | - NoSQL databases offer schema flexibility, allowing for dynamic changes to data models without downtime. 85 | - They are designed for horizontal scalability, distributing data across multiple nodes or servers. 86 | - High availability is achieved through built-in replication and partitioning, ensuring continuous operation even during node failures. 87 | - The distributed architecture stores data across multiple locations, enhancing fault tolerance and accessibility. 88 | - Some systems provide eventual consistency, where data changes propagate asynchronously, prioritizing availability over immediate consistency. 89 | 90 | ### Advantages of NoSQL Databases 91 | 92 | #### Scalability 93 | 94 | - Horizontal scaling allows adding more servers or nodes to accommodate growing data and traffic. 95 | - Data is partitioned and stored across multiple nodes, improving read/write throughput. 96 | 97 | #### Flexibility 98 | 99 | - Schema-less design permits storage of varied and evolving data structures. 100 | - Supports diverse data types, including structured, semi-structured, and unstructured data. 101 | 102 | #### Performance 103 | 104 | - Optimized for specific access patterns, handling high read or write loads efficiently. 
105 | - In-memory processing in databases like Redis provides fast access, ideal for caching and real-time analytics. 106 | 107 | #### High Availability and Fault Tolerance 108 | 109 | - Data replication across multiple nodes enhances data durability and enables failover mechanisms. 110 | - Systems continue to operate despite network partitions or node failures, which is critical for applications requiring continuous availability. 111 | 112 | #### Easy Integration with Modern Architectures 113 | 114 | - Seamless integration with serverless architectures and platforms like AWS Lambda reduces operational overhead. 115 | - Supports microservices and event-driven systems, facilitating decoupled services that can scale independently. 116 | 117 | #### Rapid Development and Prototyping 118 | 119 | - Quick setup with minimal configuration allows developers to focus on application logic. 120 | - Flexible schemas support iterative development and rapid changes, accelerating time-to-market. 121 | 122 | #### Efficient Handling of Nested Data 123 | 124 | - Embedded documents store complex data structures within a single document, eliminating the need for expensive join operations. 125 | - Naturally models hierarchical data, simplifying data retrieval and manipulation. 126 | 127 | **Example of Nested Data in MongoDB:** 128 | 129 | ```json 130 | { 131 | "_id": "order123", 132 | "customer": { 133 | "customer_id": "cust456", 134 | "name": "Jane Smith" 135 | }, 136 | "items": [ 137 | { 138 | "product_id": "prod789", 139 | "quantity": 2, 140 | "price": 19.99 141 | }, 142 | { 143 | "product_id": "prod012", 144 | "quantity": 1, 145 | "price": 9.99 146 | } 147 | ], 148 | "order_date": "2024-09-14T10:30:00Z" 149 | } 150 | ``` 151 | 152 | ### Disadvantages of NoSQL Databases 153 | 154 | #### Limited ACID Compliance 155 | 156 | - Many NoSQL databases lack full support for multi-document or multi-statement transactions. 157 | - Eventual consistency models can result in temporary inconsistencies, which may not be acceptable for certain applications. 158 | 159 | #### Complexity 160 | 161 | - Lack of a standardized query language requires learning database-specific query syntaxes. 162 | - Data modeling can be more complex, often involving denormalization and data duplication. 163 | 164 | #### Maturity and Tooling 165 | 166 | - Some NoSQL databases have less mature ecosystems compared to relational databases. 167 | - There may be fewer third-party tools, ORMs, and integrations available. 168 | 169 | #### Consistency Models 170 | 171 | - Prioritizing availability and partition tolerance often means compromising on strong consistency. 172 | - Developers may need to handle consistency and conflict resolution at the application level, adding complexity. 173 | -------------------------------------------------------------------------------- /notes/11_security_best_practices/06_sql_injection.md: -------------------------------------------------------------------------------- 1 | ## SQL Injection Attacks 2 | 3 | SQL Injection Attacks are a security concern in web applications. We'll explore how these attacks occur, examine concrete examples, and discuss effective prevention strategies. By the end of this journey, you'll have a solid understanding of SQL Injection and how to protect your applications from such vulnerabilities. 4 | 5 | After reading the material, you should be able to answer the following questions: 6 | 7 | - What is SQL Injection, and why is it a significant security threat in web applications? 
8 | - How do SQL Injection attacks occur, and what makes an application vulnerable to them?
9 | - Can you describe common examples of SQL Injection attacks and their potential impacts?
10 | - What are the most effective strategies for preventing SQL Injection attacks in applications?
11 | - How do parameterized queries and input validation contribute to securing applications against SQL Injection?
12 | 
13 | ### What is SQL Injection?
14 | 
15 | SQL Injection is a technique where attackers exploit vulnerabilities in an application's interaction with its database. By inserting malicious SQL code into input fields, they can manipulate queries to access unauthorized data, modify or delete records, and even take control of the entire database. Think of it as someone sneaking harmful instructions into a conversation between your application and its database.
16 | 
17 | ### How Does SQL Injection Happen?
18 | 
19 | At the core, SQL Injection occurs when user input is incorporated directly into SQL queries without proper validation or sanitization. This unfiltered input can alter the structure of the SQL commands, leading to unintended and potentially dangerous outcomes.
20 | 
21 | I. **User Input Submission**
22 | 
23 | Users provide input through forms, URL parameters, or other data entry points.
24 | 
25 | II. **Query Construction**
26 | 
27 | The application builds SQL queries by combining static code with user input.
28 | 
29 | III. **Query Execution**
30 | 
31 | The database executes the constructed query, which may have been tampered with if the input was malicious.
32 | 
33 | ```
34 | [ User Input ] --> [ Application ] --> [ Query Construction ] --> [ Database Execution ]
35 | ```
36 | 
37 | ### Vulnerable Code Example
38 | 
39 | Imagine a login form where users enter their username and password. A vulnerable application might handle this input as follows:
40 | 
41 | ```php
42 | <?php
43 | // Collect input directly from the login form
44 | $username = $_POST['username'];
45 | $password = $_POST['password'];
46 | 
47 | // Vulnerable: user input is concatenated straight into the SQL string
48 | $query = "SELECT * FROM users WHERE username = '$username' AND password = '$password'";
49 | 
50 | // Connect and run the query
51 | $conn = mysqli_connect('localhost', 'app_user', 'securepassword', 'mydatabase');
52 | $result = mysqli_query($conn, $query);
53 | 
54 | if (mysqli_num_rows($result) > 0) {
55 |     echo "Welcome, $username!";
56 | } else {
57 |     echo "Invalid username or password.";
58 | }
59 | ?>
60 | ```
61 | 
62 | In this example, user inputs `$username` and `$password` are directly embedded into the SQL query without any checks. This opens the door for SQL Injection.
63 | 
64 | ### Examples of SQL Injection Attacks
65 | 
66 | Let's explore how attackers can exploit such vulnerabilities with real-world scenarios.
67 | 
68 | #### Authentication Bypass
69 | 
70 | An attacker aims to gain unauthorized access by bypassing the login authentication.
71 | 
72 | The attacker inputs the following:
73 | 
74 | - **Username:** `admin' --`
75 | - **Password:** `irrelevant`
76 | 
77 | ```sql
78 | SELECT * FROM users WHERE username = 'admin' --' AND password = 'irrelevant'
79 | ```
80 | 
81 | - The `--` sequence comments out the rest of the SQL query.
82 | - The query effectively becomes:
83 | 
84 | ```sql
85 | SELECT * FROM users WHERE username = 'admin'
86 | ```
87 | 
88 | - The password check is bypassed, granting access to the 'admin' account.
89 | - The attacker successfully logs in as 'admin' without knowing the password.
90 | - They gain full administrative privileges within the application.
91 | 
92 | ```
93 | [ Malicious Input ]
94 |         |
95 |         v
96 | [ Altered Query ]
97 |         |
98 |         v
99 | [ Unauthorized Access ]
100 | ```
101 | 
102 | #### Data Extraction
103 | 
104 | An attacker tries to retrieve sensitive information from the database.
105 | 
106 | The attacker inputs:
107 | 
108 | - **Username:** `john' UNION SELECT username, password FROM users --`
109 | - **Password:** `irrelevant`
110 | 
111 | ```sql
112 | SELECT * FROM users WHERE username = 'john' UNION SELECT username, password FROM users --' AND password = 'irrelevant'
113 | ```
114 | 
115 | - The `UNION` operator combines the results of two queries.
116 | - The attacker forces the database to return all usernames and passwords.
117 | - The application may display or process the combined data.
118 | - The attacker gains access to credentials of all users.
119 | 
120 | #### Data Manipulation
121 | 
122 | An attacker wants to modify data, such as elevating their privileges.
123 | 
124 | The attacker inputs:
125 | 
126 | - **Username:** `'; UPDATE users SET role='admin' WHERE username='attacker'; --`
127 | - **Password:** `irrelevant`
128 | 
129 | ```sql
130 | SELECT * FROM users WHERE username = ''; UPDATE users SET role='admin' WHERE username='attacker'; --' AND password = 'irrelevant'
131 | ```
132 | 
133 | - The first query selects a user with an empty username.
134 | - The second query updates the attacker's role to 'admin'.
135 | - The `--` comments out the rest of the original query.
136 | - The attacker's account now has administrative privileges.
137 | - They can perform actions reserved for admins, compromising security.
138 | 
139 | #### Denial of Service
140 | 
141 | An attacker aims to disrupt the database's functionality.
142 | 
143 | The attacker inputs:
144 | 
145 | - **Username:** `'; DROP TABLE users; --`
146 | - **Password:** `irrelevant`
147 | 
148 | ```sql
149 | SELECT * FROM users WHERE username = ''; DROP TABLE users; --' AND password = 'irrelevant'
150 | ```
151 | 
152 | - The `DROP TABLE users` command deletes the entire users table.
153 | - The application loses all user data, causing it to fail.
154 | - The database is severely compromised.
155 | - Recovery may require restoring from backups, resulting in downtime.
156 | 
157 | ### Preventing SQL Injection Attacks
158 | 
159 | Understanding prevention is crucial to safeguard applications from SQL Injection.
160 | 
161 | #### Use Parameterized Queries (Prepared Statements)
162 | 
163 | Parameterized queries ensure that user input is treated strictly as data, not executable code.
164 | 
165 | **Secure Code Example in PHP using PDO:**
166 | 
167 | ```php
168 | <?php
169 | $username = $_POST['username'];
170 | $password = $_POST['password'];
171 | 
172 | // Connect with PDO and prepare a parameterized statement
173 | $pdo = new PDO('mysql:host=localhost;dbname=mydatabase', 'app_user', 'securepassword');
174 | $stmt = $pdo->prepare('SELECT * FROM users WHERE username = :username AND password = :password');
175 | 
176 | // Bind parameters
177 | $stmt->bindParam(':username', $username);
178 | $stmt->bindParam(':password', $password);
179 | 
180 | // Execute the statement
181 | $stmt->execute();
182 | 
183 | // Check if user exists
184 | if ($stmt->rowCount() > 0) {
185 |     echo "Welcome, $username!";
186 | } else {
187 |     echo "Invalid username or password.";
188 | }
189 | ?>
190 | ```
191 | 
192 | - The query structure is fixed, and parameters are bound separately.
193 | - Even if an attacker supplies malicious input, it won't alter the query's logic.
194 | 
195 | ```
196 | [ User Input ] --> [ Application ] --> [ Parameterized Query ] --> [ Safe Execution ]
197 | ```
198 | 
199 | #### Validate and Sanitize User Input
200 | 
201 | Always check that inputs meet expected criteria before using them.
202 | 
203 | ```php
204 | <?php
205 | $username = $_POST['username'];
206 | 
207 | // Accept only letters, digits, and underscores, 3 to 20 characters long
208 | if (!preg_match('/^[A-Za-z0-9_]{3,20}$/', $username)) {
209 |     die('Invalid username format.');
210 | }
211 | 
212 | // The validated value can now be passed safely to a parameterized query
213 | ?>
214 | ```
215 | 
216 | - Prevents injection of special characters.
217 | - Ensures input conforms to expected patterns.
218 | 
219 | #### Use Stored Procedures
220 | 
221 | Stored procedures are precompiled SQL statements stored in the database, which can be executed with parameters.
222 | 
223 | ```sql
224 | DELIMITER //
225 | CREATE PROCEDURE AuthenticateUser(IN p_username VARCHAR(50), IN p_password VARCHAR(50))
226 | BEGIN
227 |     SELECT * FROM users WHERE username = p_username AND password = p_password;
228 | END //
229 | DELIMITER ;
230 | ```
231 | 
232 | Calling the Stored Procedure in PHP:
233 | 
234 | ```php
235 | <?php
236 | $username = $_POST['username'];
237 | $password = $_POST['password'];
238 | 
239 | // Connect and call the stored procedure with bound parameters
240 | $pdo = new PDO('mysql:host=localhost;dbname=mydatabase', 'app_user', 'securepassword');
241 | $stmt = $pdo->prepare('CALL AuthenticateUser(:username, :password)');
242 | $stmt->bindParam(':username', $username);
243 | $stmt->bindParam(':password', $password);
244 | $stmt->execute();
245 | 
246 | // Check if user exists
247 | if ($stmt->rowCount() > 0) {
248 |     echo "Welcome, $username!";
249 | } else {
250 |     echo "Invalid username or password.";
251 | }
252 | ?>
253 | ```
254 | 
255 | - The SQL code is predefined and not altered by user input.
256 | - Parameters are handled securely by the database.
257 | 
258 | #### Implement Least Privilege Principle
259 | 
260 | Limit the database permissions of the application's user account.
261 | 
262 | - Grant only necessary privileges (e.g., `SELECT`, `INSERT`).
263 | - Avoid using database admin accounts in the application.
264 | 
265 | Example of Restricting Privileges in MySQL:
266 | 
267 | ```sql
268 | GRANT SELECT, INSERT, UPDATE ON mydatabase.users TO 'app_user'@'localhost' IDENTIFIED BY 'securepassword';
269 | ```
270 | 
271 | - Even if an attacker gains some level of access, the damage is limited.
272 | - Critical operations like dropping tables are not permitted.
273 | 
274 | #### Escape User Input
275 | 
276 | If parameterized queries aren't available, ensure special characters are properly escaped.
277 | 
278 | ```php
279 | <?php
280 | $conn = mysqli_connect('localhost', 'app_user', 'securepassword', 'mydatabase');
281 | 
282 | // Escape special characters before embedding input in the query
283 | $username = mysqli_real_escape_string($conn, $_POST['username']);
284 | $password = mysqli_real_escape_string($conn, $_POST['password']);
285 | 
286 | $query = "SELECT * FROM users WHERE username = '$username' AND password = '$password'";
287 | ```
288 | 
289 | - Escaping reduces risk but isn't foolproof.
290 | - Prefer parameterized queries when possible.
291 | --------------------------------------------------------------------------------
/notes/11_security_best_practices/07_crash_recovery_in_databases.md:
--------------------------------------------------------------------------------
1 | ## Understanding Crash Recovery in Databases
2 | 
3 | Crash recovery is an important component of database systems that ensures data consistency and durability despite unexpected events like power outages, hardware failures, or software crashes. By design, databases must be capable of returning to a reliable state after a failure occurs. This is largely accomplished through mechanisms like the Write-Ahead Log (WAL), which records changes before they are committed to the actual data files on disk.
4 | 
5 | After reading the material, you should be able to answer the following questions:
6 | 
7 | - What is crash recovery in database systems, and why is it important?
8 | - How does the Write-Ahead Log (WAL) facilitate crash recovery?
9 | - What is the role of checkpointing in the crash recovery process?
10 | - What are the main steps a database follows during crash recovery after a system restart?
11 | - What are the benefits and trade-offs associated with WAL-based recovery mechanisms?
12 | 
13 | ### The Basics of Crash Recovery
14 | 
15 | Databases typically cache data in memory (often called the buffer pool) for speed. When a change is made, such as adding a row or updating an existing row, it is applied first in memory. Only later is this modified data flushed to disk. If a crash or power loss happens mid-write, data could end up corrupted or partially written.
Crash recovery techniques help the database detect and correct any inconsistencies by replaying or discarding in-flight changes. 16 | 17 | ### The Role of the Write-Ahead Log (WAL) 18 | 19 | The WAL, sometimes called the redo log, keeps track of all modifications. Every time data is changed in memory, a record of that change is written to the WAL on disk before the database eventually writes the changed data pages to disk. 20 | 21 | - The WAL is appended in a strictly sequential manner, which is efficient for most disk types and reduces write overhead. 22 | - Because each modification is recorded in the log, the WAL acts as the authoritative record of what changed in the database. 23 | - If the system crashes, the database can use the WAL to redo committed changes that may not have made it to the data files, or ignore changes for uncommitted transactions. 24 | 25 | ``` 26 | WAL: 27 | +----------------+ 28 | | Application | 29 | | Transaction | 30 | +--------+-------+ 31 | | 32 | | 1. Emit transaction 33 | v 34 | +--------+-------+ 35 | | Write-Ahead | 36 | | Log (WAL) | 37 | +--------+-------+ 38 | | 39 | +-------------------------------+-------------------------------+ 40 | | | 41 | | 2a. Append record to WAL buffer | 42 | | | 43 | | +----------------+ +----------------+ | 44 | | | WAL In-Memory | --(write)--> | WAL On-Disk | | 45 | | | Buffer | | (log file) | | 46 | | +----------------+ +----------------+ | 47 | | | | | 48 | | | 2b. Fsync (sync to durable storage) | | 49 | | +------------------------------------>+ | 50 | | | 51 | +---------------------------------------------------------------+ 52 | | 53 | | 3. Apply logged changes 54 | v 55 | +--------+-------+ 56 | | Data Store | 57 | | (Main DB) | 58 | +----------------+ 59 | ``` 60 | 61 | ### WAL and Transaction States 62 | 63 | Databases manage transactions to make sure atomicity (all or nothing). The WAL is directly tied to these transaction guarantees: 64 | 65 | - Once a transaction commits, its entries in the WAL are written to disk. Even if a crash occurs immediately afterward, the committed changes can be replayed from the WAL upon restart. 66 | - If the system crashes before these transactions commit, the database treats them as rolled back. Uncommitted WAL entries are discarded or ignored during recovery. 67 | 68 | ### Checkpointing 69 | 70 | A checkpoint operation flushes all in-memory data pages to disk and writes a special checkpoint record to the WAL. This makes the on-disk data more up-to-date and reduces the amount of log replay needed if a crash occurs. 71 | 72 | - Frequent checkpoints mean there is less WAL data to replay during restart. 73 | - Writing all in-memory data pages to disk can be expensive, especially for large or very active databases. 74 | - Administrators tune checkpoint frequency to balance acceptable recovery time with acceptable performance during normal operations. 75 | 76 | ### Crash Recovery Steps 77 | 78 | When a database restarts after a crash, it goes through a sequence of steps to make sure a consistent state: 79 | 80 | 1. The database checks the latest **checkpoint** in the WAL to identify the last checkpoint. 81 | 2. Committed transactions after the checkpoint are applied to the data files to bring them up to date. 82 | 3. Any uncommitted transactions in the WAL are discarded or rolled back so they do not appear as valid changes. 83 | 4. The database finishes replaying WAL records and transitions back to handling regular queries. 
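How much WAL must be replayed during these steps depends directly on how recently the last checkpoint completed. As a rough illustration, PostgreSQL exposes both a manual checkpoint command and the settings that control automatic ones (names and defaults differ in other engines):

```sql
-- Force an immediate checkpoint (typically requires superuser privileges)
CHECKPOINT;

-- Inspect the knobs that govern automatic checkpoints
SHOW checkpoint_timeout;  -- maximum time between automatic checkpoints
SHOW max_wal_size;        -- WAL volume that triggers a checkpoint before the timeout
```

Shorter intervals shrink recovery time after a crash but increase the background I/O spent flushing dirty pages, which ties back to the tuning trade-off described above.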
84 | 85 | ### Flushing the WAL 86 | 87 | Some databases offer configuration options for controlling how often the WAL is physically written and synchronized to disk: 88 | 89 | - Ensures the operating system flushes the WAL to stable storage, guaranteeing durability. 90 | - Allows multiple transactions to commit before flushing, reducing the total number of disk writes at the cost of slightly delayed durability. 91 | - Stricter flushing maintains stronger guarantees but can lower throughput for write-heavy workloads. 92 | 93 | ``` 94 | +===========================+ 95 | | Database System | 96 | +===========================+ 97 | | 98 | | Initiate Transaction 99 | v 100 | +---------------------------+ 101 | | Client/System | 102 | +---------------------------+ 103 | | 104 | | Generate Log Record 105 | v 106 | +---------------------------+ 107 | | WAL Buffer (RAM) | 108 | | ----------------------- | 109 | | | Log Record 1 | | 110 | | | Log Record 2 | | 111 | | | Log Record 3 | | 112 | | | ... | | 113 | | ----------------------- | 114 | +---------------------------+ 115 | | 116 | | Sequential Write (Flush) 117 | v 118 | +---------------------------+ 119 | | Stable Storage (Disk) | 120 | | ----------------------- | 121 | | | Log Record 1 | | 122 | | | Log Record 2 | | 123 | | | Log Record 3 | | 124 | | | ... | | 125 | | ----------------------- | 126 | +---------------------------+ 127 | | 128 | | Apply Changes to Database 129 | v 130 | +---------------------------+ 131 | | Database Files | 132 | | ----------------------- | 133 | | | Data Page A | | 134 | | | Data Page B | | 135 | | | ... | | 136 | | ----------------------- | 137 | +---------------------------+ 138 | ``` 139 | 140 | ### Benefits of WAL-Based Recovery 141 | 142 | - Committed transactions survive power loss or crashes, thanks to the WAL. 143 | - WAL records are appended sequentially, which matches well with disk I/O patterns. 144 | - WAL entries can be streamed to secondary systems for real-time or near-real-time replication. 145 | - The WAL can be archived and replayed on top of a previous full backup to reach a desired point in time. 146 | 147 | ### Drawbacks and Trade-Offs 148 | 149 | - WAL files occupy extra disk space that administrators must monitor and manage. 150 | - Every change is written at least twice—once to the WAL, then later to the actual data file. 151 | - Frequent checkpoints can spike I/O usage and temporarily slow other operations. 152 | - Adjusting checkpoint intervals, flush frequencies, and other parameters requires careful tuning to find the right balance between performance and reliability. 153 | 154 | ### Practical Example 155 | 156 | Consider an `orders` table: 157 | 158 | | OrderID | CustomerID | Status | 159 | |---------|-----------|-----------| 160 | | 1 | 1001 | Pending | 161 | | 2 | 1002 | Shipped | 162 | | 3 | 1003 | Delivered | 163 | 164 | Suppose a user updates `OrderID = 1` from `Pending` to `Shipped`. 165 | 166 | I. The database modifies the in-memory page representing `OrderID = 1`. 167 | 168 | II. A corresponding record showing the old and new values (`Pending` -> `Shipped`) is appended to the WAL on disk. 169 | 170 | III. The data file containing the `orders` table may not be updated immediately. 171 | 172 | IV. If the database crashes at this point, the WAL can be replayed to recover the change. 173 | 174 | V. After restart, the database replays the WAL entries for all committed transactions, ensuring `OrderID = 1` is set to `Shipped` in the data file. 
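In a real engine, the durability of step II (appending the WAL record) and the timing of step III (writing the data file) are governed by the flush and checkpoint settings discussed earlier. As one concrete illustration, PostgreSQL exposes these knobs through ordinary SQL; the parameter names below are PostgreSQL-specific, and other engines use different ones.

```sql
-- Inspect durability- and checkpoint-related settings (PostgreSQL)
SHOW synchronous_commit;   -- 'on' means a commit waits for its WAL record to be flushed
SHOW checkpoint_timeout;   -- maximum time between automatic checkpoints
SHOW max_wal_size;         -- amount of WAL that triggers a checkpoint

-- Trade a small durability window for higher throughput, for this session only
SET synchronous_commit = off;

-- Force a checkpoint manually, for example before taking a base backup
CHECKPOINT;
```

With `synchronous_commit = off`, a crash can lose the last few reported commits even though the WAL mechanism itself stays intact, which is exactly the trade-off described in the Flushing the WAL section.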
175 | 176 | ### Visualizing Crash Recovery 177 | 178 | ``` 179 | # 180 | +-----------------+ 181 | Changes in --> | Memory | 182 | the database | (Buffer Pool) | 183 | +--------+--------+ 184 | | 185 | WAL Record Written 186 | | 187 | v 188 | +-----------------+ 189 | | Write-Ahead Log | 190 | | (Redo Log) | 191 | +--------+--------+ 192 | | 193 | Checkpoint ---------->+ 194 | (Flush data pages) . 195 | . 196 | . 197 | v 198 | +------------------+ 199 | | Data Files on | 200 | | Disk | 201 | +------------------+ 202 | ``` 203 | 204 | - The buffer pool stores active data pages in memory. 205 | - All changes are recorded in the WAL on disk before data files are updated. 206 | - A checkpoint flushes the current in-memory state of data to disk and records this action in the WAL. 207 | - After a crash, the database replays committed transactions from the WAL and ignores uncommitted changes. 208 | 209 | ### Best Practices 210 | 211 | I. Find the right interval to minimize both I/O spikes and recovery time. 212 | 213 | II. Make sure adequate storage capacity and regularly archive or clean old WAL files. 214 | 215 | III. Configure fsync to make sure that WAL data truly resides on stable media. 216 | 217 | IV. Combine periodic full backups with continuous WAL archiving for point-in-time recovery. 218 | 219 | V. Validate recovery settings in staging environments to confirm that the database can recover from abrupt failures. 220 | -------------------------------------------------------------------------------- /notes/12_database_engines/03_postgresql.md: -------------------------------------------------------------------------------- 1 | ## PostgreSQL 2 | - PostgreSQL: a powerful, open-source object-relational database management system (ORDBMS) 3 | - Developed and maintained by the PostgreSQL Global Development Group 4 | - Suitable for a wide range of applications, from small-scale projects to enterprise-level systems 5 | 6 | ## Features 7 | 8 | ### ACID Compliance 9 | Supports ACID transactions, ensuring data consistency and reliability 10 | 11 | ### Extensibility 12 | - Allows custom functions, operators, data types, and index methods 13 | - Supports stored procedures, triggers, and views 14 | 15 | ### Concurrency Control 16 | Uses Multi-Version Concurrency Control (MVCC) to handle concurrent access without locking 17 | 18 | ### Robust Security 19 | Offers strong encryption, authentication, and authorization mechanisms 20 | 21 | ### Cross-Platform 22 | Compatible with various operating systems, including Windows, macOS, Linux, and Unix 23 | 24 | ### Full-Text Search 25 | Built-in support for text search and advanced indexing 26 | 27 | ### Spatial Data Support 28 | Support for geographic objects and spatial queries through PostGIS extension 29 | 30 | ### High Availability and Replication 31 | Supports various replication methods, including streaming replication and logical replication 32 | 33 | ## PostgreSQL Commands 34 | 35 | ### Creating a Database 36 | 37 | ``` 38 | CREATE DATABASE database_name; 39 | ``` 40 | 41 | ### Creating Tables 42 | 43 | ``` 44 | CREATE TABLE table_name ( 45 | column_name1 datatype PRIMARY KEY, 46 | column_name2 datatype NOT NULL, 47 | ... 48 | ); 49 | ``` 50 | 51 | ### Inserting Data 52 | 53 | ``` 54 | INSERT INTO table_name (column1, column2, ...) 55 | VALUES (value1, value2, ...); 56 | ``` 57 | 58 | ### Querying Data 59 | 60 | ``` 61 | SELECT column1, column2, ... 
62 | FROM table_name 63 | WHERE condition; 64 | ``` 65 | 66 | ### Updating Data 67 | 68 | ``` 69 | UPDATE table_name 70 | SET column1 = value1, column2 = value2, ... 71 | WHERE condition; 72 | ``` 73 | 74 | ### Deleting Data 75 | 76 | ``` 77 | DELETE FROM table_name 78 | WHERE condition; 79 | ``` 80 | 81 | ### Dropping Tables 82 | 83 | ``` 84 | DROP TABLE table_name; 85 | ``` 86 | 87 | ## Administration and Management 88 | 89 | ### pgAdmin 90 | A popular, open-source graphical administration tool for PostgreSQL 91 | 92 | ### Command-Line Client 93 | 94 | A text-based interface for executing SQL queries and managing databases (e.g., psql) 95 | 96 | ### Performance Tuning 97 | 98 | PostgreSQL provides various configuration options for optimizing performance 99 | 100 | ### Backup and Recovery 101 | 102 | Supports logical and physical backups using tools like pg_dump and pg_basebackup 103 | 104 | ### Monitoring 105 | 106 | Built-in statistics collector for monitoring and diagnosing performance issues 107 | 108 | ## Use Cases 109 | - Common choice for web applications due to its flexibility and extensibility 110 | - Widely used in GIS applications due to PostGIS support 111 | - Suitable for data warehousing and analytical processing workloads 112 | - Ideal for large-scale enterprise applications requiring a robust and feature-rich RDBMS 113 | 114 | ## ENgine 115 | 116 | PostgreSQL uses a single, unified storage engine. However, PostgreSQL provides a rich and flexible architecture for handling data and offers many advanced features. Unlike MySQL, which uses multiple storage engines, PostgreSQL uses a unified engine but provides mechanisms to customize storage and indexing behaviors. 117 | 118 | ### **Key Features of PostgreSQL’s Storage System** 119 | 120 | #### 1. **Unified Storage Engine** 121 | - PostgreSQL uses a single, robust storage engine for all operations, ensuring ACID compliance and high performance across a variety of workloads. 122 | 123 | --- 124 | 125 | #### 2. **Table Storage Models** 126 | - PostgreSQL organizes data in tables using a row-based storage model. 127 | - **Heap Storage:** 128 | - Default storage model for tables. 129 | - Data is stored in no particular order, and MVCC (Multiversion Concurrency Control) ensures consistency without locking reads. 130 | 131 | --- 132 | 133 | #### 3. **MVCC (Multiversion Concurrency Control)** 134 | - PostgreSQL uses MVCC to handle transactions and concurrency. 135 | - Instead of locking rows, it creates multiple versions of rows to ensure consistency. 136 | - MVCC supports advanced features like: 137 | - Point-in-time recovery (PITR). 138 | - Non-blocking reads during write operations. 139 | 140 | --- 141 | 142 | #### 4. **Tablespaces** 143 | - PostgreSQL supports tablespaces, allowing users to control where data files are stored on disk. 144 | - Useful for optimizing storage performance and managing large-scale data systems. 145 | 146 | --- 147 | 148 | #### 5. **TOAST (The Oversized-Attribute Storage Technique)** 149 | - PostgreSQL can handle large data fields like blobs, JSON, or XML efficiently using TOAST. 150 | - Automatically stores large column data externally and references it in the main table. 151 | 152 | --- 153 | 154 | ### **Indexing Options in PostgreSQL** 155 | PostgreSQL supports a variety of indexing methods, allowing customization for different workloads: 156 | 157 | 1. **B-Tree:** 158 | - Default index type, ideal for most general-purpose queries. 159 | 2. 
**Hash:** 160 | - Optimized for equality lookups (e.g., `=` or `IN`). 161 | 3. **GIN (Generalized Inverted Index):** 162 | - Efficient for full-text searches and indexing JSON/array fields. 163 | 4. **GiST (Generalized Search Tree):** 164 | - Useful for spatial data, geometric searches, and full-text indexing. 165 | 5. **BRIN (Block Range Index):** 166 | - Optimized for large, sequentially stored data sets like time-series data. 167 | 6. **SP-GiST (Space Partitioned GiST):** 168 | - Efficient for non-balanced tree structures like quadtrees or k-d trees. 169 | 7. **Bloom Filters:** 170 | - Space-efficient, probabilistic data structures for certain query types. 171 | 172 | --- 173 | 174 | ### **Advanced Features in PostgreSQL** 175 | 176 | #### 1. **Partitioning** 177 | - PostgreSQL supports declarative partitioning (range, list, and hash) to optimize large data sets by dividing them into smaller, manageable parts. 178 | 179 | #### 2. **Foreign Data Wrappers (FDW)** 180 | - Allows PostgreSQL to interact with external data sources (e.g., other databases, files) as if they were local tables. 181 | 182 | #### 3. **Custom Data Types** 183 | - PostgreSQL allows users to define their own data types, providing flexibility for domain-specific applications. 184 | 185 | #### 4. **JSON and JSONB Support** 186 | - PostgreSQL has robust support for semi-structured data through JSON and JSONB. 187 | - JSONB (binary JSON) provides efficient indexing and querying capabilities. 188 | 189 | #### 5. **Full-Text Search** 190 | - Built-in full-text search capabilities enable efficient querying of textual data. 191 | 192 | #### 6. **PL/pgSQL and Other Procedural Languages** 193 | - PostgreSQL supports embedded procedural languages (e.g., PL/pgSQL, PL/Python, PL/Perl), allowing complex application logic to run within the database. 194 | 195 | --- 196 | 197 | ### **Storage Customization in PostgreSQL** 198 | 199 | - While PostgreSQL doesn’t use multiple storage engines like MySQL, it offers advanced configuration and extension capabilities to tailor the database to specific needs: 200 | - Extensions like `pg_partman` for partition management. 201 | - `TimescaleDB` for time-series data. 202 | - `PostGIS` for geographic and spatial data. 203 | 204 | ### **Comparison to MySQL** 205 | - **Single Unified Engine vs. Multiple Engines:** 206 | - PostgreSQL has a unified engine with deep extensibility, while MySQL offers multiple engines (e.g., InnoDB, MyISAM). 207 | - **Concurrency:** PostgreSQL’s MVCC implementation often outperforms MySQL for high-concurrency scenarios. 208 | - **Extensibility:** PostgreSQL supports custom data types, extensions, and advanced indexing, making it more flexible for complex applications. 209 | - **ACID Compliance:** PostgreSQL is fully ACID-compliant by default, whereas MySQL depends on the storage engine (e.g., InnoDB is ACID-compliant, MyISAM is not). 
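To tie several of these capabilities together, the snippet below sketches JSONB with a GIN index and declarative range partitioning. The table and column names are invented for illustration; the syntax itself is standard PostgreSQL.

```sql
-- JSONB column with a GIN index for fast containment queries
CREATE TABLE events (
    id         BIGSERIAL PRIMARY KEY,
    payload    JSONB NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX idx_events_payload ON events USING GIN (payload);

SELECT id
FROM events
WHERE payload @> '{"status": "error"}';  -- can be served by the GIN index

-- Declarative range partitioning, useful for large time-series tables
CREATE TABLE metrics (
    recorded_at TIMESTAMPTZ NOT NULL,
    value       DOUBLE PRECISION
) PARTITION BY RANGE (recorded_at);

CREATE TABLE metrics_2024_01 PARTITION OF metrics
    FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');
```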
210 | 211 | -------------------------------------------------------------------------------- /notes/13_big_data/01_data_warehousing.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /notes/13_big_data/02_hadoop_and_hdfs.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /notes/13_big_data/03_spark_sql.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /notes/14_orm/01_introduction_to_orm.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /notes/14_orm/02_popular_orm_tools.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /scripts/concurrency/concurrent_readers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sqlite3 3 | import multiprocessing 4 | import argparse 5 | import os 6 | import time 7 | 8 | DB = 'demo.sqlite' 9 | 10 | def setup_database(): 11 | """Create the database and test table.""" 12 | if os.path.exists(DB): 13 | os.remove(DB) 14 | conn = sqlite3.connect(DB) 15 | conn.execute(""" 16 | CREATE TABLE test ( 17 | id INTEGER PRIMARY KEY, 18 | value TEXT 19 | ); 20 | """) 21 | conn.execute("INSERT INTO test (id, value) VALUES (1, 'initial');") 22 | conn.commit() 23 | conn.close() 24 | print("[Setup] Initialized database with one row.\n") 25 | 26 | def writer(exclusive_mode): 27 | """Begin a long transaction as writer.""" 28 | conn = sqlite3.connect(DB, isolation_level=None, timeout=30) 29 | cur = conn.cursor() 30 | if exclusive_mode: 31 | # Force all locks to be exclusive; readers will block 32 | conn.execute("PRAGMA locking_mode = EXCLUSIVE;") 33 | conn.execute("PRAGMA journal_mode = DELETE;") 34 | print("[Writer] PRAGMA locking_mode=EXCLUSIVE; journal_mode=DELETE") 35 | conn.execute("BEGIN EXCLUSIVE;") 36 | else: 37 | # WAL gives concurrent readers during writes 38 | conn.execute("PRAGMA journal_mode = WAL;") 39 | conn.execute("PRAGMA wal_autocheckpoint = 1000;") 40 | print("[Writer] PRAGMA journal_mode=WAL; wal_autocheckpoint=1000") 41 | conn.execute("BEGIN;") 42 | print(f"[Writer] Started transaction; updating row...") 43 | conn.execute("UPDATE test SET value = ? WHERE id = 1;", 44 | ('written-'+('X' if exclusive_mode else 'W'),)) 45 | print(f"[Writer] Update done, sleeping for 5s to hold lock...") 46 | time.sleep(5) 47 | conn.commit() 48 | print("[Writer] Committed and released locks.\n") 49 | conn.close() 50 | 51 | def reader(name): 52 | """Attempt to read the single row.""" 53 | print(f"[{name}] Attempting to read...") 54 | conn = sqlite3.connect(DB, timeout=30) 55 | try: 56 | cur = conn.execute("SELECT value FROM test WHERE id = 1;") 57 | val = cur.fetchone()[0] 58 | print(f"[{name}] Read value = {val!r}") 59 | except sqlite3.OperationalError as e: 60 | print(f"[{name}] ERROR: {e}") 61 | finally: 62 | conn.close() 63 | 64 | if __name__ == '__main__': 65 | parser = argparse.ArgumentParser( 66 | description="Demo: Exclusive lock vs. 
WAL concurrency in SQLite" 67 | ) 68 | parser.add_argument( 69 | '--exclusive', action='store_true', 70 | help='If set, writer uses EXCLUSIVE locking (readers will block)' 71 | ) 72 | args = parser.parse_args() 73 | 74 | setup_database() 75 | 76 | # Start the writer process 77 | w = multiprocessing.Process(target=writer, args=(args.exclusive,)) 78 | w.start() 79 | 80 | # Give writer a moment to acquire its lock 81 | time.sleep(1) 82 | 83 | # Start a few readers concurrently 84 | readers = [] 85 | for i in range(3): 86 | p = multiprocessing.Process(target=reader, args=(f"Reader#{i+1}",)) 87 | readers.append(p) 88 | p.start() 89 | time.sleep(0.2) # stagger the readers slightly 90 | 91 | # Wait for all to finish 92 | for p in readers: 93 | p.join() 94 | w.join() 95 | 96 | # Final read after writer is done 97 | print("\n[Main] Final read after writer commits:") 98 | reader("FinalReader") 99 | -------------------------------------------------------------------------------- /scripts/concurrency/deadlock_file_level.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sqlite3 3 | import multiprocessing 4 | import argparse 5 | import os 6 | import time 7 | 8 | DB1 = 'db1.sqlite' 9 | DB2 = 'db2.sqlite' 10 | 11 | 12 | def setup_databases(): 13 | """Create two databases with the same schema and one row each.""" 14 | for db in (DB1, DB2): 15 | try: 16 | os.remove(db) 17 | except FileNotFoundError: 18 | pass 19 | conn = sqlite3.connect(db) 20 | conn.execute(""" 21 | CREATE TABLE IF NOT EXISTS test ( 22 | id INTEGER PRIMARY KEY, 23 | value TEXT 24 | ); 25 | """) 26 | conn.execute("DELETE FROM test;") 27 | conn.execute("INSERT INTO test (id, value) VALUES (?, ?);", (1, 'init')) 28 | conn.commit() 29 | conn.close() 30 | print("Databases initialized.\n") 31 | 32 | 33 | def worker(name, first_db, second_db, evt_first_done, evt_second_done, deadlock_mode): 34 | """Each worker locks first_db then second_db.""" 35 | # very large timeout = effectively infinite; small timeout = quick exception 36 | busy_ms = 10 ** 9 if deadlock_mode else 100 37 | conn = sqlite3.connect(DB1, timeout=30, isolation_level=None) 38 | conn.execute(f"PRAGMA busy_timeout = {busy_ms};") 39 | conn.execute("ATTACH DATABASE ? 
AS db2;", (DB2,)) 40 | try: 41 | # always start DEFERRED 42 | conn.execute("BEGIN DEFERRED;") 43 | print(f"[{name}] BEGIN DEFERRED on {first_db}") 44 | conn.execute(f"UPDATE {first_db}.test SET value='{name}-step1' WHERE id=1;") 45 | print(f"[{name}] Locked {first_db} and updated step1") 46 | evt_first_done.set() # signal the other 47 | print(f"[{name}] Waiting for other to lock {second_db}...") 48 | evt_second_done.wait() 49 | print(f"[{name}] Now trying to update {second_db}.test (step2)") 50 | 51 | # attempt second update (may block or raise) 52 | conn.execute(f"UPDATE {second_db}.test SET value='{name}-step2' WHERE id=1;") 53 | conn.commit() 54 | print(f"[{name}] Committed both updates successfully!\n") 55 | 56 | except sqlite3.OperationalError as e: 57 | print(f"[{name}] OperationalError: {e}") 58 | if not deadlock_mode: 59 | # handle the “deadlock/busy” by rolling back and retrying immediately 60 | print(f"[{name}] Handling lock contention: rolling back and retrying with IMMEDIATE transaction") 61 | conn.rollback() 62 | time.sleep(0.1) 63 | conn.execute("BEGIN IMMEDIATE;") 64 | conn.execute(f"UPDATE {first_db}.test SET value='{name}-retry1' WHERE id=1;") 65 | conn.execute(f"UPDATE {second_db}.test SET value='{name}-retry2' WHERE id=1;") 66 | conn.commit() 67 | print(f"[{name}] Retry succeeded under IMMEDIATE mode!\n") 68 | finally: 69 | conn.close() 70 | 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser( 74 | description="Demonstrate SQLite deadlock vs. handling" 75 | ) 76 | parser.add_argument( 77 | '--deadlock', action='store_true', 78 | help='If set, simulate an indefinite deadlock (DEFERRED + infinite timeout)' 79 | ) 80 | args = parser.parse_args() 81 | 82 | setup_databases() 83 | 84 | # events for sync 85 | e1 = multiprocessing.Event() 86 | e2 = multiprocessing.Event() 87 | 88 | # worker1 locks main then db2 89 | p1 = multiprocessing.Process( 90 | target=worker, 91 | args=('W1', 'main', 'db2', e1, e2, args.deadlock) 92 | ) 93 | # worker2 locks db2 then main 94 | p2 = multiprocessing.Process( 95 | target=worker, 96 | args=('W2', 'db2', 'main', e2, e1, args.deadlock) 97 | ) 98 | 99 | p1.start() 100 | p2.start() 101 | p1.join() 102 | p2.join() 103 | 104 | # show final state 105 | for db in (DB1, DB2): 106 | conn = sqlite3.connect(db) 107 | val = conn.execute("SELECT value FROM test WHERE id=1;").fetchone()[0] 108 | conn.close() 109 | print(f"Final {db}.test.value = {val!r}") 110 | -------------------------------------------------------------------------------- /scripts/concurrency/deadlock_row_level.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import time 3 | import psycopg2 4 | from psycopg2 import sql, OperationalError, errors 5 | 6 | DSN = "dbname=test user=postgres password=secret host=localhost port=5432" 7 | 8 | def setup_db(): 9 | with psycopg2.connect(DSN) as conn: 10 | with conn.cursor() as cur: 11 | cur.execute(""" 12 | DROP TABLE IF EXISTS deadlock_demo; 13 | CREATE TABLE deadlock_demo ( 14 | id SERIAL PRIMARY KEY, 15 | val TEXT 16 | ); 17 | INSERT INTO deadlock_demo (val) VALUES ('A'), ('B'); 18 | """) 19 | conn.commit() 20 | print("✅ Table created and initialized.") 21 | 22 | def cleanup_db(): 23 | with psycopg2.connect(DSN) as conn: 24 | with conn.cursor() as cur: 25 | cur.execute("DROP TABLE IF EXISTS deadlock_demo;") 26 | conn.commit() 27 | print("🧹 Table dropped, cleanup complete.") 28 | 29 | def worker(name, first_id, second_id, delay): 30 | conn = psycopg2.connect(DSN) 31 | 
conn.autocommit = False 32 | cur = conn.cursor() 33 | try: 34 | cur.execute("BEGIN;") 35 | # Lock the first row 36 | cur.execute( 37 | sql.SQL("SELECT val FROM deadlock_demo WHERE id = %s FOR UPDATE;"), 38 | [first_id] 39 | ) 40 | print(f"{name}: locked row {first_id}") 41 | 42 | time.sleep(delay) 43 | 44 | # Now try to lock the second row 45 | print(f"{name}: attempting to lock row {second_id}") 46 | cur.execute( 47 | sql.SQL("SELECT val FROM deadlock_demo WHERE id = %s FOR UPDATE;"), 48 | [second_id] 49 | ) 50 | print(f"{name}: locked row {second_id} — no deadlock?") 51 | 52 | conn.commit() 53 | except OperationalError as e: 54 | # Psycopg2 raises a generic OperationalError for deadlocks 55 | print(f"{name}: DEADLOCK detected! → {e}") 56 | conn.rollback() 57 | finally: 58 | cur.close() 59 | conn.close() 60 | 61 | if __name__ == "__main__": 62 | setup_db() 63 | 64 | # Thread A: locks row 1 then row 2 65 | t1 = threading.Thread(target=worker, args=("Thread-A", 1, 2, 1)) 66 | # Thread B: locks row 2 then row 1 67 | t2 = threading.Thread(target=worker, args=("Thread-B", 2, 1, 1)) 68 | 69 | t1.start() 70 | t2.start() 71 | t1.join() 72 | t2.join() 73 | 74 | cleanup_db() 75 | print("🏁 Demo complete.") 76 | -------------------------------------------------------------------------------- /scripts/concurrency/mvcc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sqlite3, time, uuid, os 3 | from multiprocessing import Process 4 | 5 | DB = "mvcc.db" 6 | TBL = "products_versioned" 7 | SENTINEL = 1e12 # "infinite" timestamp 8 | 9 | def setup(): 10 | if os.path.exists(DB): 11 | os.remove(DB) 12 | with sqlite3.connect(DB) as conn: 13 | c = conn.cursor() 14 | c.execute(f""" 15 | CREATE TABLE {TBL} ( 16 | id INTEGER, 17 | name TEXT, 18 | quantity INTEGER, 19 | version_id TEXT, 20 | valid_from REAL, 21 | valid_to REAL DEFAULT {SENTINEL}, 22 | PRIMARY KEY(id, version_id) 23 | ); 24 | """) 25 | now = time.time() 26 | c.execute(f""" 27 | INSERT INTO {TBL}(id,name,quantity,version_id,valid_from) 28 | VALUES(1,'Widget',100,?,?); 29 | """, (str(uuid.uuid4()), now)) 30 | conn.commit() 31 | 32 | def transaction(label, delta, sleep_secs): 33 | conn = sqlite3.connect(DB, isolation_level=None) # autocommit off 34 | c = conn.cursor() 35 | # 1) start a snapshot 36 | c.execute("BEGIN;") 37 | start_ts = time.time() 38 | print(f"[{label}] BEGIN at {start_ts:.3f}") 39 | 40 | # 2) read 41 | c.execute(f""" 42 | SELECT quantity, version_id 43 | FROM {TBL} 44 | WHERE id=1 AND valid_from <= ? AND valid_to > ? 45 | ORDER BY valid_from DESC LIMIT 1; 46 | """, (start_ts, start_ts)) 47 | row = c.fetchone() 48 | if not row: 49 | print(f"[{label}] nothing to read!") 50 | c.execute("ROLLBACK;") 51 | return 52 | qty, vid = row 53 | print(f"[{label}] read qty={qty} vid={vid}") 54 | 55 | time.sleep(sleep_secs) # simulate work 56 | 57 | # 3) attempt to expire & insert new version 58 | now = time.time() 59 | # expire *only if* version_id still matches, otherwise conflict 60 | updated = c.execute(f""" 61 | UPDATE {TBL} 62 | SET valid_to=? 63 | WHERE id=1 64 | AND version_id=? 
65 | AND valid_to={SENTINEL}; 66 | """, (now, vid)).rowcount 67 | 68 | if updated != 1: 69 | print(f"[{label}] ABORT: concurrent write detected") 70 | c.execute("ROLLBACK;") 71 | return 72 | 73 | new_qty = qty + delta 74 | c.execute(f""" 75 | INSERT INTO {TBL}(id,name,quantity,version_id,valid_from) 76 | VALUES(1,'Widget',?, ?, ?); 77 | """, (new_qty, str(uuid.uuid4()), now)) 78 | c.execute("COMMIT;") 79 | print(f"[{label}] COMMIT new_qty={new_qty}") 80 | 81 | def show_versions(): 82 | with sqlite3.connect(DB) as conn: 83 | for row in conn.execute(f""" 84 | SELECT version_id, quantity, valid_from, valid_to 85 | FROM {TBL} ORDER BY valid_from; 86 | """): 87 | print(row) 88 | 89 | if __name__ == "__main__": 90 | setup() 91 | # two processes, they’ll run concurrently 92 | pA = Process(target=transaction, args=("A", +50, 2)) 93 | pB = Process(target=transaction, args=("B", -30, 1)) 94 | pA.start(); pB.start() 95 | pA.join(); pB.join() 96 | 97 | print("\nAll versions:") 98 | show_versions() 99 | -------------------------------------------------------------------------------- /scripts/concurrency/optimistic_vs_pesimistic_lock.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sqlite3 3 | import time 4 | import uuid 5 | import os 6 | from multiprocessing import Process 7 | 8 | # Constants 9 | DATABASE = 'lock_demo.db' 10 | MAX_RETRIES = 3 11 | RETRY_DELAY = 1 12 | 13 | class VersionConflict(Exception): 14 | pass 15 | 16 | def setup_database(): 17 | """Create the products table and initialize one row.""" 18 | if os.path.exists(DATABASE): 19 | os.remove(DATABASE) 20 | with sqlite3.connect(DATABASE) as conn: 21 | # Use WAL so readers aren’t blocked during pessimistic locks 22 | conn.execute("PRAGMA journal_mode = WAL;") 23 | conn.execute(""" 24 | CREATE TABLE products ( 25 | id INTEGER PRIMARY KEY, 26 | name TEXT, 27 | quantity INTEGER, 28 | version INTEGER 29 | ); 30 | """) 31 | conn.execute(""" 32 | INSERT INTO products (id, name, quantity, version) 33 | VALUES (1, 'Widget', 100, 1); 34 | """) 35 | print("[Setup] Database initialized.\n") 36 | 37 | def pessimistic_transaction(name, delay_before_update=2): 38 | """Pessimistic: BEGIN IMMEDIATE to acquire RESERVED lock up-front.""" 39 | try: 40 | with sqlite3.connect(DATABASE, timeout=10) as conn: 41 | conn.execute("PRAGMA journal_mode = WAL;") 42 | cur = conn.cursor() 43 | print(f"{name}: BEGIN IMMEDIATE") 44 | cur.execute("BEGIN IMMEDIATE;") 45 | cur.execute("SELECT quantity FROM products WHERE id = 1;") 46 | qty = cur.fetchone()[0] 47 | print(f"{name}: read quantity = {qty}") 48 | time.sleep(delay_before_update) 49 | new_qty = qty - 10 50 | cur.execute("UPDATE products SET quantity = ? 
WHERE id = 1;", (new_qty,)) 51 | print(f"{name}: updated quantity to {new_qty}") 52 | print(f"{name}: COMMIT\n") 53 | except sqlite3.OperationalError as e: 54 | print(f"{name}: OperationalError - {e}\n") 55 | 56 | def optimistic_transaction(name, delay_before_update=2): 57 | """Optimistic: read-version, check-and-set, retry on conflict.""" 58 | for attempt in range(1, MAX_RETRIES + 1): 59 | try: 60 | with sqlite3.connect(DATABASE, timeout=5) as conn: 61 | conn.execute("PRAGMA journal_mode = WAL;") 62 | cur = conn.cursor() 63 | print(f"{name}: BEGIN (attempt {attempt})") 64 | cur.execute("BEGIN;") 65 | cur.execute("SELECT quantity, version FROM products WHERE id = 1;") 66 | row = cur.fetchone() 67 | if row is None: 68 | print(f"{name}: no row found, aborting\n") 69 | return 70 | qty, ver = row 71 | print(f"{name}: read quantity = {qty}, version = {ver}") 72 | time.sleep(delay_before_update) 73 | new_qty = qty - 10 74 | new_ver = ver + 1 75 | cur.execute(""" 76 | UPDATE products 77 | SET quantity = ?, version = ? 78 | WHERE id = 1 AND version = ?; 79 | """, (new_qty, new_ver, ver)) 80 | if cur.rowcount == 0: 81 | raise VersionConflict() 82 | print(f"{name}: updated to quantity = {new_qty}, version = {new_ver}") 83 | print(f"{name}: COMMIT\n") 84 | return 85 | except VersionConflict: 86 | print(f"{name}: version conflict, retrying after {RETRY_DELAY}s...\n") 87 | time.sleep(RETRY_DELAY) 88 | except sqlite3.OperationalError as e: 89 | print(f"{name}: OperationalError - {e}\n") 90 | return 91 | print(f"{name}: failed after {MAX_RETRIES} attempts\n") 92 | 93 | def display_final_state(): 94 | """Show the final quantity and version.""" 95 | with sqlite3.connect(DATABASE) as conn: 96 | cur = conn.cursor() 97 | cur.execute("SELECT quantity, version FROM products WHERE id = 1;") 98 | row = cur.fetchone() 99 | if row: 100 | qty, ver = row 101 | print(f"[Final State] quantity = {qty}, version = {ver}\n") 102 | else: 103 | print("[Final State] Product not found\n") 104 | 105 | if __name__ == '__main__': 106 | # Pessimistic locking demo 107 | setup_database() 108 | print("--- Pessimistic Locking Demo ---") 109 | p1 = Process(target=pessimistic_transaction, args=("ProcA", 4)) 110 | p2 = Process(target=pessimistic_transaction, args=("ProcB", 0)) 111 | p1.start() 112 | time.sleep(0.5) # ProcA grabs RESERVED lock first 113 | p2.start() 114 | p1.join() 115 | p2.join() 116 | display_final_state() 117 | 118 | # Optimistic locking demo 119 | setup_database() 120 | print("--- Optimistic Locking Demo ---") 121 | o1 = Process(target=optimistic_transaction, args=("ProcX", 3)) 122 | o2 = Process(target=optimistic_transaction, args=("ProcY", 1)) 123 | o1.start() 124 | time.sleep(0.5) 125 | o2.start() 126 | o1.join() 127 | o2.join() 128 | display_final_state() 129 | -------------------------------------------------------------------------------- /scripts/concurrency/transaction_isolation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sqlite3, threading, time, os 3 | 4 | DB = 'isolation_demo.db' 5 | 6 | def cleanup(): 7 | # Remove main DB and any WAL/SHM files 8 | for suffix in ('', '-wal', '-shm'): 9 | try: 10 | os.remove(DB + suffix) 11 | except FileNotFoundError: 12 | pass 13 | 14 | def setup_db_journal(journal): 15 | """Initialize a fresh DB with the given journal_mode and quantity=100.""" 16 | cleanup() 17 | conn = sqlite3.connect(DB) 18 | # Set journal mode (WAL in both demos) 19 | conn.execute(f"PRAGMA journal_mode = {journal};") 20 | 
conn.execute("DROP TABLE IF EXISTS products;") 21 | conn.execute(""" 22 | CREATE TABLE products ( 23 | id INTEGER PRIMARY KEY, 24 | quantity INTEGER 25 | ); 26 | """) 27 | conn.execute("INSERT INTO products (id, quantity) VALUES (1, 100);") 28 | conn.commit() 29 | conn.close() 30 | print(f"\n[Setup: journal_mode={journal}] initialized quantity=100") 31 | 32 | def isolation_demo(): 33 | print("\n--- Isolation Demo (WAL): NO dirty reads ---") 34 | setup_db_journal('WAL') 35 | 36 | def writer(): 37 | conn = sqlite3.connect(DB, isolation_level=None) 38 | cur = conn.cursor() 39 | # Deferred transaction: snapshot taken at BEGIN 40 | cur.execute("BEGIN;") 41 | cur.execute("UPDATE products SET quantity = 200 WHERE id = 1;") 42 | print("[Writer] updated to 200 but NOT yet committed") 43 | time.sleep(2) 44 | conn.commit() 45 | print("[Writer] committed") 46 | conn.close() 47 | 48 | def reader(): 49 | conn = sqlite3.connect(DB, isolation_level=None) 50 | cur = conn.cursor() 51 | # Start snapshot BEFORE writer updates 52 | cur.execute("BEGIN;") 53 | print("[Reader] began snapshot transaction") 54 | time.sleep(3) 55 | qty = cur.execute("SELECT quantity FROM products WHERE id = 1;").fetchone()[0] 56 | print(f"[Reader] read quantity = {qty} (should be 100)") 57 | conn.rollback() # end the transaction 58 | conn.close() 59 | 60 | t_r = threading.Thread(target=reader) 61 | t_w = threading.Thread(target=writer) 62 | 63 | t_r.start() 64 | time.sleep(1) # ensure reader has its snapshot open 65 | t_w.start() 66 | 67 | t_w.join() 68 | t_r.join() 69 | 70 | final_qty = sqlite3.connect(DB).execute( 71 | "SELECT quantity FROM products WHERE id = 1;" 72 | ).fetchone()[0] 73 | print(f"[Final State] quantity = {final_qty}") 74 | 75 | def dirty_read_demo(): 76 | print("\n--- Dirty-Read Demo (WAL + shared cache): ALLOW dirty reads ---") 77 | setup_db_journal('WAL') 78 | 79 | # Use a URI to enable shared cache 80 | uri = f'file:{DB}?cache=shared' 81 | 82 | def writer(): 83 | conn = sqlite3.connect(uri, uri=True, isolation_level=None) 84 | cur = conn.cursor() 85 | cur.execute("BEGIN;") 86 | cur.execute("UPDATE products SET quantity = 200 WHERE id = 1;") 87 | print("[Writer] updated to 200 but NOT yet committed") 88 | time.sleep(2) 89 | conn.commit() 90 | print("[Writer] committed") 91 | conn.close() 92 | 93 | def reader(): 94 | conn = sqlite3.connect(uri, uri=True, isolation_level=None) 95 | cur = conn.cursor() 96 | # Allow dirty reads 97 | cur.execute("PRAGMA read_uncommitted = 1;") 98 | time.sleep(1) 99 | qty = cur.execute("SELECT quantity FROM products WHERE id = 1;").fetchone()[0] 100 | print(f"[Reader] read quantity = {qty} (dirty read of 200)") 101 | conn.close() 102 | 103 | t_w = threading.Thread(target=writer) 104 | t_r = threading.Thread(target=reader) 105 | 106 | t_w.start() 107 | t_r.start() 108 | 109 | t_w.join() 110 | t_r.join() 111 | 112 | final_qty = sqlite3.connect(DB).execute( 113 | "SELECT quantity FROM products WHERE id = 1;" 114 | ).fetchone()[0] 115 | print(f"[Final State] quantity = {final_qty}") 116 | 117 | if __name__ == "__main__": 118 | isolation_demo() 119 | dirty_read_demo() 120 | print("\nDone.") 121 | -------------------------------------------------------------------------------- /scripts/create_mock_db.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | from datetime import datetime 3 | import os 4 | 5 | def create_connection(db_file): 6 | """ Create a database connection to a SQLite database """ 7 | conn = None 8 | try: 9 | conn = 
sqlite3.connect(db_file) 10 | print(f"Creating or reading database at: {os.path.abspath(db_file)}") 11 | return conn 12 | except sqlite3.Error as e: 13 | print(e) 14 | 15 | return conn 16 | 17 | def create_table(conn): 18 | """ Create a table if it does not exist """ 19 | try: 20 | sql_create_table = """ CREATE TABLE IF NOT EXISTS example_table ( 21 | id integer PRIMARY KEY AUTOINCREMENT, 22 | timestamp text NOT NULL 23 | ); """ 24 | cursor = conn.cursor() 25 | cursor.execute(sql_create_table) 26 | print("Table 'example_table' created or already exists.") 27 | except sqlite3.Error as e: 28 | print(e) 29 | 30 | def append_rows(conn, num_rows=1): 31 | """ Append rows to the table """ 32 | try: 33 | cursor = conn.cursor() 34 | for _ in range(num_rows): 35 | timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 36 | sql_insert_row = """ INSERT INTO example_table(timestamp) 37 | VALUES(?) """ 38 | cursor.execute(sql_insert_row, (timestamp,)) 39 | print(f"Row added with timestamp: {timestamp}") 40 | conn.commit() 41 | except sqlite3.Error as e: 42 | print(e) 43 | 44 | def main(): 45 | database = "database.db" 46 | 47 | # create a database connection 48 | conn = create_connection(database) 49 | if conn is not None: 50 | # create table 51 | create_table(conn) 52 | 53 | # append rows 54 | append_rows(conn, num_rows=100000) # Change num_rows as needed 55 | 56 | conn.close() 57 | else: 58 | print("Error! cannot create the database connection.") 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /scripts/diagrams/hash_ring.py: -------------------------------------------------------------------------------- 1 | import math 2 | import matplotlib 3 | matplotlib.use("Agg") # headless backend 4 | import matplotlib.pyplot as plt 5 | 6 | # ----------------- helper utilities ------------------------------------- 7 | def deg2rad(deg: float) -> float: 8 | return deg * math.pi / 180.0 9 | 10 | 11 | def draw_ring(nodes, data, title, outfile): 12 | fig = plt.figure(figsize=(5, 5)) 13 | ax = fig.add_subplot(111, polar=True) 14 | 15 | # Dashed circumference 16 | circle_theta = [i * math.pi / 180.0 for i in range(361)] 17 | ax.plot(circle_theta, [1.0] * len(circle_theta), 18 | linestyle="--", linewidth=0.7, color="#888888", zorder=0) 19 | 20 | # Sorted node angles for boundary "pizza slices" 21 | node_angles = sorted(nodes.values()) 22 | for deg in node_angles: 23 | rad = deg2rad(deg) 24 | ax.plot([rad, rad], [0, 1], 25 | color="#bbbbbb", linewidth=0.8, zorder=0) 26 | 27 | # Node markers + labels 28 | for name, deg in nodes.items(): 29 | rad = deg2rad(deg) 30 | ax.scatter(rad, 1, marker="o", s=90, zorder=3) 31 | ax.text(rad, 1.22, name, ha="center", va="center", 32 | fontsize=9, weight="bold", zorder=3) 33 | 34 | # Data key markers + labels 35 | if data: 36 | for key, deg in data.items(): 37 | rad = deg2rad(deg) 38 | ax.scatter(rad, 1, marker="^", s=70, zorder=3) 39 | ax.text(rad, 0.88, key, ha="center", va="center", 40 | fontsize=8, zorder=3) 41 | 42 | # Aesthetics 43 | ax.set_rticks([]) 44 | ax.set_theta_zero_location("N") 45 | ax.set_theta_direction(-1) 46 | ax.grid(False) 47 | 48 | # Manual title placement to avoid overlap 49 | fig.text(0.02, 0.95, title, ha="left", va="top", 50 | fontsize=11, weight="bold") 51 | 52 | fig.tight_layout() 53 | fig.savefig(outfile, dpi=120) 54 | plt.close(fig) 55 | 56 | 57 | # ----------------- scenarios -------------------------------------------- 58 | nodes_abc = {"Node A": 0, "Node B": 120, 
"Node C": 240} 59 | nodes_add_d = {"Node A": 0, "Node D": 80, "Node B": 120, "Node C": 240} 60 | nodes_no_b = {"Node A": 0, "Node D": 80, "Node C": 240} 61 | 62 | # Consistent key set 63 | data_keys = {"K1": 100, "K2": 200, "K3": 330} 64 | 65 | draw_ring(nodes_abc, None, "Nodes A, B, C", "ring_nodes.png") 66 | draw_ring(nodes_abc, data_keys, "Nodes + Data Keys", "ring_nodes_data.png") 67 | draw_ring(nodes_add_d, data_keys, "Add Node D","ring_add_node_d.png") 68 | draw_ring(nodes_no_b, data_keys, "Remove Node B", "ring_remove_node_b.png") 69 | 70 | print("✓ Diagrams generated:") 71 | for png in ("ring_nodes.png", "ring_nodes_data.png", 72 | "ring_add_node_d.png", "ring_remove_node_b.png"): 73 | print(" └─ ", png) 74 | -------------------------------------------------------------------------------- /scripts/generating_query_strings/create_table.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | def generate_create_table_query(table_name: str, columns: Dict[str, str]) -> str: 4 | """ 5 | Generate a SQL CREATE TABLE query for a given table name and column definitions. 6 | 7 | Args: 8 | table_name (str): The name of the table to create. 9 | columns (Dict[str, str]): A dictionary where keys are column names and values are their data types. 10 | 11 | Returns: 12 | str: A SQL CREATE TABLE query string. 13 | 14 | Example: 15 | >>> generate_create_table_query( 16 | "students", 17 | {"id": "INT PRIMARY KEY", "name": "VARCHAR(100)", "age": "INT", "email": "VARCHAR(100)"} 18 | ) 19 | "CREATE TABLE students (id INT PRIMARY KEY, name VARCHAR(100), age INT, email VARCHAR(100));" 20 | """ 21 | # Preparing the column definitions part of the query 22 | columns_clause = ", ".join(f"{column} {data_type}" for column, data_type in columns.items()) 23 | 24 | query = f"CREATE TABLE {table_name} ({columns_clause});" 25 | 26 | return query 27 | 28 | if __name__ == "__main__": 29 | table_columns = { 30 | "id": "INT PRIMARY KEY", 31 | "name": "VARCHAR(100)", 32 | "age": "INT", 33 | "email": "VARCHAR(100)" 34 | } 35 | 36 | # Expected output: 37 | # CREATE TABLE students (id INT PRIMARY KEY, name VARCHAR(100), age INT, email VARCHAR(100)); 38 | print(generate_create_table_query("students", table_columns)) 39 | -------------------------------------------------------------------------------- /scripts/generating_query_strings/delete.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | def generate_delete_query(table_name: str, conditions: List[Tuple[str, str]]) -> str: 4 | """ 5 | Generate a SQL DELETE query for a given table name and conditions. 6 | 7 | Args: 8 | table_name (str): The name of the table from which to delete records. 9 | conditions (List[Tuple[str, str]]): A list of conditions for the deletion, each represented as a (column, value) tuple. 10 | 11 | Returns: 12 | str: A SQL DELETE query string. 
13 | 14 | Example: 15 | >>> generate_delete_query("students", [("name", "John Doe"), ("age", "20")]) 16 | "DELETE FROM students WHERE name = 'John Doe' AND age = '20';" 17 | """ 18 | # Preparing the WHERE part of the query 19 | where_clause = " AND ".join(f"{column} = '{value.replace(\"'\", \"''\")}'" for column, value in conditions) 20 | 21 | query = f"DELETE FROM {table_name} WHERE {where_clause};" 22 | 23 | return query 24 | 25 | if __name__ == "__main__": 26 | delete_conditions = [ 27 | ("name", "John Doe"), 28 | ("age", "20") 29 | ] 30 | 31 | # Expected output: 32 | # DELETE FROM students WHERE name = 'John Doe' AND age = '20'; 33 | print(generate_delete_query("students", delete_conditions)) 34 | -------------------------------------------------------------------------------- /scripts/generating_query_strings/insert_query.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | def generate_insert_query(table_name: str, data: List[List[str]]) -> str: 4 | """ 5 | Generate a SQL INSERT query for a given table name and data. 6 | 7 | This function takes the name of a database table and a list of rows, 8 | where each row is a list of string values. It returns a SQL INSERT 9 | query string that can be used to insert the provided data into the 10 | specified table. 11 | 12 | Args: 13 | table_name (str): The name of the table where data will be inserted. 14 | data (List[List[str]]): A list of rows, with each row being a list of 15 | string values representing the data to be inserted. 16 | 17 | Returns: 18 | str: A SQL INSERT query string. 19 | 20 | Example: 21 | >>> generate_insert_query("users", [["John", "Doe", "john.doe@example.com"], ["Jane", "Doe", "jane.doe@example.com"]]) 22 | "INSERT INTO users VALUES ('John', 'Doe', 'john.doe@example.com'), ('Jane', 'Doe', 'jane.doe@example.com');" 23 | """ 24 | # Escaping single quotes in data and formatting rows 25 | escaped_data = [ 26 | ", ".join(f"'{str(value).replace(\"'\", \"''\")}'" for value in row) 27 | for row in data 28 | ] 29 | 30 | # Joining all rows into a single query 31 | values = "), (".join(escaped_data) 32 | query = f"INSERT INTO {table_name} VALUES ({values});" 33 | 34 | return query 35 | 36 | if __name__ == "__main__": 37 | sample_data = [ 38 | ["John", "Doe", "john.doe@example.com"], 39 | ["Jane", "Doe", "jane.doe@example.com"], 40 | ] 41 | 42 | # Expected output: 43 | # INSERT INTO users VALUES ('John', 'Doe', 'john.doe@example.com'), ('Jane', 'Doe', 'jane.doe@example.com'); 44 | print(generate_insert_query("users", sample_data)) 45 | -------------------------------------------------------------------------------- /scripts/generating_query_strings/select.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | def generate_select_query(table_name: str, columns: List[str], conditions: List[Tuple[str, str]]) -> str: 4 | """ 5 | Generate a SQL SELECT query for a given table name, columns, and conditions. 6 | 7 | Args: 8 | table_name (str): The name of the table from which to select records. 9 | columns (List[str]): A list of columns to be selected. 10 | conditions (List[Tuple[str, str]]): A list of conditions for the selection, each represented as a (column, value) tuple. 11 | 12 | Returns: 13 | str: A SQL SELECT query string. 
14 | 15 | Example: 16 | >>> generate_select_query("students", ["name", "email"], [("age", "20")]) 17 | "SELECT name, email FROM students WHERE age = '20';" 18 | """ 19 | # Preparing the SELECT and WHERE parts of the query 20 | columns_clause = ", ".join(columns) 21 | where_clause = " AND ".join(f"{column} = '{value.replace(\"'\", \"''\")}'" for column, value in conditions) 22 | 23 | query = f"SELECT {columns_clause} FROM {table_name} WHERE {where_clause};" 24 | 25 | return query 26 | 27 | if __name__ == "__main__": 28 | select_columns = ["name", "email"] 29 | select_conditions = [ 30 | ("age", "20") 31 | ] 32 | 33 | # Expected output: 34 | # SELECT name, email FROM students WHERE age = '20'; 35 | print(generate_select_query("students", select_columns, select_conditions)) 36 | -------------------------------------------------------------------------------- /scripts/generating_query_strings/update_query.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | def generate_update_query(table_name: str, data: Dict[str, str], conditions: List[Tuple[str, str]]) -> str: 4 | """ 5 | Generate a SQL UPDATE query for a given table name, data, and conditions. 6 | 7 | This function constructs a SQL UPDATE query string to update specified columns 8 | with provided values in a given table, based on certain conditions. 9 | 10 | Args: 11 | table_name (str): The name of the table to update. 12 | data (Dict[str, str]): A dictionary where keys are column names and values are the new values to be set. 13 | conditions (List[Tuple[str, str]]): A list of conditions for the update, with each condition represented as a (column, value) tuple. 14 | 15 | Returns: 16 | str: A SQL UPDATE query string. 17 | 18 | Example: 19 | >>> generate_update_query( 20 | "users", 21 | {"email": "john.updated@example.com", "last_name": "UpdatedDoe"}, 22 | [("first_name", "John"), ("last_name", "Doe")] 23 | ) 24 | "UPDATE users SET email = 'john.updated@example.com', last_name = 'UpdatedDoe' WHERE first_name = 'John' AND last_name = 'Doe';" 25 | """ 26 | # Preparing the SET part of the query 27 | set_clause = ", ".join(f"{column} = '{value.replace(\"'\", \"''\")}'" for column, value in data.items()) 28 | 29 | # Preparing the WHERE part of the query 30 | where_clause = " AND ".join(f"{column} = '{value.replace(\"'\", \"''\")}'" for column, value in conditions) 31 | 32 | query = f"UPDATE {table_name} SET {set_clause} WHERE {where_clause};" 33 | 34 | return query 35 | 36 | if __name__ == "__main__": 37 | update_data = { 38 | "email": "john.updated@example.com", 39 | "last_name": "UpdatedDoe" 40 | } 41 | update_conditions = [ 42 | ("first_name", "John"), 43 | ("last_name", "Doe") 44 | ] 45 | 46 | # Expected output: 47 | # UPDATE users SET email = 'john.updated@example.com', last_name = 'UpdatedDoe' WHERE first_name = 'John' AND last_name = 'Doe'; 48 | print(generate_update_query("users", update_data, update_conditions)) 49 | -------------------------------------------------------------------------------- /scripts/setup/start_postgres.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | echo "=== 1. Updating package index ===" 5 | sudo apt-get update -y 6 | 7 | echo "=== 2. Installing Docker (if not already installed) ===" 8 | if ! command -v docker &> /dev/null; then 9 | sudo apt-get install -y docker.io 10 | else 11 | echo "Docker is already installed, skipping." 
12 | fi 13 | 14 | echo "=== 3. Starting and enabling Docker service ===" 15 | sudo systemctl start docker 16 | sudo systemctl enable docker 17 | 18 | echo "=== 4. Pulling latest Postgres image ===" 19 | docker pull postgres:latest 20 | 21 | echo "=== 5. Removing any old 'postgres-local' container ===" 22 | if docker ps -a --format '{{.Names}}' | grep -Eq "^postgres-local\$"; then 23 | echo "Stopping and removing existing 'postgres-local' container..." 24 | docker rm -f postgres-local 25 | fi 26 | 27 | echo "=== 6. Running new Postgres container ===" 28 | docker run -d \ 29 | --name postgres-local \ 30 | -e POSTGRES_DB=test \ 31 | -e POSTGRES_USER=demo \ 32 | -e POSTGRES_PASSWORD=secret \ 33 | -p 127.0.0.1:5432:5432 \ 34 | postgres:latest 35 | 36 | echo "=== Done! ===" 37 | echo "You can now connect with:" 38 | echo " host=127.0.0.1 port=5432 dbname=test user=demo password=secret" 39 | --------------------------------------------------------------------------------
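Once the container reports ready (the very first start can take a few seconds while the data directory is initialized), a quick smoke test using the credentials defined in the script above is:

```bash
docker exec -it postgres-local psql -U demo -d test -c "SELECT version();"
```

Any client on the host pointed at 127.0.0.1:5432 with the same user, password, and database will work as well.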