├── .github └── workflows │ └── redo-typo-pr.yml ├── .gitignore ├── .vscode └── settings.json ├── DISCLAIMER.md ├── NEW_README.md ├── README.md ├── abandoned └── .gitkeep ├── dd.template.md ├── docs ├── custom-slashing │ └── dd.md ├── faucets │ ├── dd.md │ └── prd.md ├── mempools │ └── dd.md └── prover-submission-cost │ └── prd.md ├── images └── node40gb.png ├── implemented └── .gitkeep ├── in-progress ├── .gitkeep ├── 0003-native-merkle-trees.md ├── 11103-forwarder-contract │ └── design.md ├── 12487-prd-fast-sync.md ├── 12639-prd-inactivity-leak.md ├── 5040-native-merkle-trees-napi.md ├── 7025-instrumenting-the-node-with-open-telemetry.md ├── 7346-batch-proving-circuits-and-l1.md ├── 7482-sequencer-prover-test-net.md ├── 7520-testnet-overview.md ├── 7588-spartan-clusters.md ├── 8077-request-response │ ├── design.md │ └── p2p-layer-overview.png ├── 8131-forced-inclusion.md ├── 8401-proof-timeliness │ ├── design.md │ ├── proof-timeliness.ipynb │ └── proving-phases.png ├── 8404-based-fallback.md ├── 8509-prover-coordination │ └── design.md ├── 8754-slashing-staking │ ├── Contract_Overview.png │ └── design.md ├── 8757-fees │ ├── design.md │ └── notebook │ │ ├── .gitignore │ │ ├── .ipynb_checkpoints │ │ └── fee-model-checkpoint.ipynb │ │ ├── .python-version │ │ ├── README.md │ │ ├── ape-config.yaml │ │ ├── blocks.pkl │ │ ├── fee-model.ipynb │ │ ├── pyproject.toml │ │ └── uv.lock ├── 9101-blob-integration │ └── design.md ├── images │ ├── 8404 │ │ ├── image_1.png │ │ ├── image_2.png │ │ ├── image_3.png │ │ └── image_4.png │ └── 7482-contracts.png ├── proving-queue │ ├── 0005-proving-queue.md │ ├── broker.png │ └── proving-arch.png └── world-state │ ├── 0004-world-state.md │ ├── append-only-tree.png │ ├── current-tree-state-structure.png │ ├── first-prune.png │ ├── historic-hash-path.png │ ├── image-structure.png │ ├── reference-counting.png │ ├── second-prune.png │ └── snapshot-tree-structure.png ├── prd.template.md ├── rejected └── .gitkeep └── scripts └── redo-typo-pr 
/.github/workflows/redo-typo-pr.yml: -------------------------------------------------------------------------------- 1 | name: Redo Typo PR 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | pr_number: 7 | description: "The PR number to redo" 8 | required: true 9 | type: string 10 | 11 | pull_request_target: 12 | types: [labeled] 13 | branches: 14 | - master 15 | paths-ignore: 16 | - "**/README.md" 17 | 18 | jobs: 19 | redo-typo-pr: 20 | if: github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'redo-typo-pr') 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - name: Checkout repository 25 | uses: actions/checkout@v3 26 | with: 27 | token: ${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} 28 | 29 | - name: Authenticate with GitHub CLI 30 | run: | 31 | echo "${{ secrets.AZTEC_BOT_GITHUB_TOKEN }}" | gh auth login --with-token 32 | 33 | - name: Set git configure for commits 34 | run: | 35 | # Identify ourselves, needed to commit 36 | git config --global user.name AztecBot 37 | git config --global user.email tech@aztecprotocol.com 38 | 39 | - name: Determine PR number 40 | id: determine-pr-number 41 | run: echo "PR_NUMBER=${{ github.event.inputs.pr_number || github.event.pull_request.number }}" >> $GITHUB_ENV 42 | 43 | - name: Run repo-typo-pr script 44 | run: ./scripts/redo-typo-pr ${{ env.PR_NUMBER }} 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": ["Gossipable"] 3 | } 4 | -------------------------------------------------------------------------------- /DISCLAIMER.md: -------------------------------------------------------------------------------- 1 | The information set out herein is 
for discussion purposes only and does not represent any binding indication or commitment by Aztec Labs and its employees to take any action whatsoever, including relating to the structure and/or any potential operation of the Aztec protocol or the protocol roadmap. In particular: (i) nothing in these projects, requests, or comments is intended to create any contractual or other form of legal relationship with Aztec Labs or third parties who engage with this AztecProtocol GitHub account (including, without limitation, by responding to a conversation or submitting comments) (ii) by engaging with any conversation or request, the relevant persons are consenting to Aztec Labs’ use and publication of such engagement and related information on an open-source basis (and agree that Aztec Labs will not treat such engagement and related information as confidential), and (iii) Aztec Labs is not under any duty to consider any or all engagements, and that consideration of such engagements and any decision to award grants or other rewards for any such engagement is entirely at Aztec Labs’ sole discretion. Please do not rely on any information on this account for any purpose - the development, release, and timing of any products, features, or functionality remains subject to change and is currently entirely hypothetical. Nothing on this account should be treated as an offer to sell any security or any other asset by Aztec Labs or its affiliates, and you should not rely on any content or comments for advice of any kind, including legal, investment, financial, tax, or other professional advice. 
2 | -------------------------------------------------------------------------------- /NEW_README.md: -------------------------------------------------------------------------------- 1 | # Engineering Process for Product Requirements Documents and Design Documents 2 | 3 | - Owner: @just-mitch 4 | - Approvers: 5 | - Product: @joeandrews, @iAmMichaelConnor, @aminsammara, @0xrafi, @rahul-kothari 6 | - Engineering: @charlielye, @LHerskind, @dbanks12, @PhilWindle, @nventuro, @Rumata888 7 | - DevRel: @critesjosh 8 | - Target PRD Approval Date: 2025-03-14 9 | 10 | ## Key words 11 | 12 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC 2119](https://datatracker.ietf.org/doc/html/rfc2119). 13 | 14 | ## Introduction 15 | 16 | - "Complex work" by engineering MUST have a Design Document (DD). 17 | - Determining whether work is "complex" is the responsibility of engineering team leads and product managers. Good candidates include: 18 | - Introduces new concepts, data structures, algorithms, services, or contracts 19 | - Interface or protocol changes 20 | - Affects multiple products or users (including internal and external) 21 | - Major refactoring or restructuring of code 22 | - Sometimes, the work prescribed by a DD is large (e.g. it takes over a week of engineering hours to fully build and test), or there are several interrelated designs that all pertain to a single "chunk of work" or "epic" or "product" (e.g. sequencer selection). 23 | - In these cases, the DD SHOULD have an associated Project Requirements Document (PRD). 24 | - If the DD does not have an associated PRD, it MUST articulate the basic requirements for the work. 25 | - If the requirements for a project/product are not clear, and engineering cannot quickly work out the requirements with the product/project owner, engineering SHOULD ask for a PRD to be created. 
26 | - PRDs and DDs MUST be kept up to date via pull requests. 27 | - Note, making first drafts in other platforms (e.g. Google Docs or hackmd) is encouraged for rapid iteration. 28 | - Changes to the requirements in a PRD require re-approval by the individuals or groups specified in the original PRD. 29 | - Changes to the design which affect its compliance with the PRD MUST be re-approved by the individuals or groups specified in the original PRD. 30 | 31 | ## Project Requirements Document (PRD) 32 | 33 | The purpose of a PRD is to describe **what** is being built, **why**, for **whom**, and **when** it is needed. 34 | 35 | It avoids describing **how** the work will be done, though it MAY provide guidance or opinions. 36 | 37 | In a nutshell, it should tell the reader "User segment X wants to be able to Y1, Y2, and Y3. We know this because Z. Here are the requirements for X to get Y. We need to deliver this by D." 38 | 39 | - A PRD MUST be created as a pull request. 40 | - A PRD MUST be a single document in a new directory in the `docs` directory. 41 | - For example, if the project is called `cool-user-flow`, the PRD should be in `docs/cool-user-flow/prd.md`. 42 | - A PRD MUST be written in markdown. 43 | - A PRD MUST identify: 44 | - The project name 45 | - The person responsible for the project 46 | - Any individuals or groups that must approve the PRD. 
47 | - This MUST include 48 | - someone on product 49 | - an engineering team lead 50 | - a devrel engineer if the project touches external stakeholders 51 | - This MAY include 52 | - someone from the legal team 53 | - someone from the finance team 54 | - someone from the sales team 55 | - The PRD's target approval date 56 | - The delivery deadline for the project 57 | - The main user stories the project is intended to support 58 | - Whether and how demand for those user stories has been validated 59 | - Requirements describing the desired functionality/qualities in [RFC 2119](https://datatracker.ietf.org/doc/html/rfc2119) terms 60 | - Each requirement SHOULD identify: 61 | - Why the requirement exists 62 | - Where the requirement comes from (e.g. what user is asking for it, or what other observation inspired the requirement) 63 | - The KPIs and target values that candidate solutions will be measured against 64 | - Each requirement MAY: 65 | - Give guidance on how the requirement may change beyond the delivery deadline 66 | - The PRD SHOULD: 67 | - Define any key terms used in the requirements 68 | - Include a list of assumptions made in creating the requirements 69 | - Include a list of dependencies on other requirements 70 | - Include commentary on where the ideal candidate solution should sit in a tradeoff space, to give guidance on how different candidate solutions will be compared. For example, stating that between two candidate solutions, the one that has the lower "cost measured in X" will be preferred. 71 | - Once all required approvals have been received, the PRD SHOULD be merged within 72 hours. 72 | 73 | ## Design Document (DD) 74 | 75 | The purpose of a DD is to describe **how** the work will be done. 76 | 77 | - A DD MUST be created as a pull request. 78 | - A DD MUST be a single document in a new directory in the `docs` directory. 79 | - For example, if the project is called `my-project`, the DD SHOULD be in `docs/my-project/dd.md`. 
80 | - A DD MUST be written in markdown. 81 | - A DD MUST identify: 82 | - The PRD (including the commit hash) that the DD supports (if applicable) 83 | - A title/name for the design 84 | - The design approach 85 | - The key architecture decisions 86 | - Alternatives considered and why they were not chosen 87 | - How the design is expected to perform against the requirements in the PRD 88 | - A lead engineer for the project 89 | - A target completion date for the project 90 | - Any individuals or groups that must approve the DD. 91 | - This MUST include 92 | - at least 1 engineer on the same team as the lead engineer (MUST be the team lead if the team lead is not the lead engineer) 93 | - 1 engineer from each other team that is significantly impacted by the project 94 | - the product manager responsible for the project 95 | - This SHOULD include 96 | - individuals outside of product/engineering that approved the PRD 97 | - The DD's target approval date 98 | - A DD SHOULD identify: 99 | - assumptions/trade-offs made in creating the design 100 | - dependencies on other documents 101 | - a test plan for the project 102 | - a timeline for the project 103 | - preliminary diagrams or performance metrics as appropriate 104 | - Engineers SHOULD NOT start work on a DD until the PRD has been approved and merged. 105 | - Engineers SHOULD only do enough engineering work prior to the DD approval to allow them to write a good DD. 106 | - Once all required approvals have been received, the DD SHOULD be merged within 72 hours. 107 | 108 | ## PRD Template 109 | 110 | See [prd.template.md](prd.template.md) for a template for PRDs. 111 | 112 | ## DD Template 113 | 114 | See [dd.template.md](dd.template.md) for a template for DDs. 
115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Design-Driven Development Process 2 | 3 | During a sprint planning meeting, the team will discuss whether any of the issues to be worked on require a design document. 4 | 5 | Something likely qualifies for a design document if it: 6 | 7 | - Creates or changes an interface 8 | - Introduces new concepts, data structures, or algorithms 9 | 10 | If so, the owner of the issue will: 11 | 12 | 1. **Create a new design document in `./in-progress`**: 13 | - Follow the naming convention `NNNN-title.md` 14 | - `NNNN` is the issue number 15 | - `title` is a short description of the design (in `kebab-case`). 16 | 17 | 18 | 2. **Establish initial approvers**: 19 | - The first approver is your team lead. If you are a team lead, choose another approver. 20 | - If you are an approver, you can add other approvers or remove yourself as needed. 21 | - Work with your first approver to identify other approvers and stakeholders. 22 | 23 | 3. **Collaborate and iterate on the design document**: 24 | - Create and refine the first draft, then submit a PR 25 | - When you are ready, share the PR in the #engineering-designs channel in slack. 26 | - Include the design approvers, stakeholders, and the target approval date in your message 27 | - e.g.: "[feature]. [target approval date]. cc @alice, @bob, @carol" 28 | - Open the thread and paste the "Executive Summary" from your design. 29 | - Iterate on the design based on the feedback and discussions 30 | 31 | 4. **Get approval**: 32 | - When approvers are satisfied with the design, they will approve the PR on github. 33 | - When all approvers have signed off, merge the PR with the design 34 | 35 | 5. **Implement the design** based on the finalized design document. 
36 | - Any changes or decisions made during the implementation phase should be captured in user docs or protocol spec *and flagged in PRs*. 37 | - When the design has been implemented, create a PR to move it from `./in-progress` to `./implemented` 38 | - Include a brief explanation of changes to the original design 39 | 40 | 41 | If a design is ultimately rejected, the owner should update the PR to merge the design document into the `./rejected` directory after adding a comment explaining why it was rejected. 42 | 43 | If a design is approved for implementation and then abandoned, the owner should create a PR to move it from `in-progress` to `abandoned` after adding a comment explaining why the design was abandoned. 44 | 45 | 46 | ## Approvers vs Stakeholders 47 | 48 | Stakeholders are anyone materially impacted by your change. This should be a large-ish set: PMs, DevRel, engineers across the stack, so on. Shoot for at least 5 stakeholders. 49 | 50 | Approvers are a subset intimately familiar with what you're doing who will share the responsibility for the outcomes from the design. Shoot for 2-5 approvers. 51 | 52 | ## Rejected Designs 53 | 54 | If a design is ultimately rejected, then untag the document with `design-wip`, and tag it with `design-rejected`. Include a brief explanation of why the design was rejected. 55 | 56 | Note, rejected designs are good. It saves us from implementing bad designs, and allows us to point back at why certain decisions were made. 57 | 58 | 59 | ## Zen-spiration 60 | 61 | Why do we care about designs? 62 | 63 | > "The beginning is the most important part of the work." - Plato 64 | 65 | A design means we can move faster because we all have more clarity on what we're building and how all the users interact. 66 | 67 | > "Design is not what you see, but what you make others see." - Edgar Degas 68 | 69 | Speed is paramount: it allows us to more rapidly provide value to users, and refine that value. 70 | 71 | > "Design is an iterative process. 
The key is to generate ideas, test them, and iterate until you find the right solution." - John Maeda 72 | 73 | Designs are solutions to problems. Problems are experienced by users. Therefore, a design-driven mindset is a user-centric mindset. 74 | 75 | > "The role of the designer is that of a good, thoughtful host anticipating the needs of his guests." - Charles Eames 76 | 77 | Users include end-users, external developers, internal developers, and even/especially components of the system itself. When we think about designing components as users, we can think about the "primitives" that we need. 78 | 79 | > "Primitives are the raw parts or the most foundational-level building blocks for software developers. They’re indivisible (if they can be functionally split into two they must) and they do one thing really well. They’re meant to be used together rather than as solutions in and of themselves. And, we’ll build them for maximum developer flexibility. We won’t put a bunch of constraints on primitives to guard against developers hurting themselves. Rather, we’ll optimize for developer freedom and innovation." - 2003 AWS Vision document 80 | 81 | Keeping yourself in a purposeful, user-centric mindset is a discipline. 82 | 83 | > "Quality is not an act, it is a habit." - Aristotle 84 | 85 | 86 | # Template 87 | 88 | | | | 89 | | -------------------- | --------------------------------- | 90 | | Issue | [title](github.com/link/to/issue) | 91 | | Owners | @you | 92 | | Approvers | @alice @bob | 93 | | Target Approval Date | YYYY-MM-DD | 94 | 95 | 96 | ## Executive Summary 97 | 98 | Provide the executive summary on your major proposed changes. 99 | 100 | ## Introduction 101 | 102 | Briefly describe the problem the work solves, and for whom. Include any relevant background information and the goals (and non-goals) of this implementation. 103 | 104 | ## Interface 105 | 106 | Who are your users, and how do they interact with this? What is the top-level interface? 
107 | 108 | ## Implementation 109 | 110 | Delve into the specifics of the design. Include diagrams, code snippets, API descriptions, and database schema changes as necessary. Highlight any significant changes to the existing architecture or interfaces. 111 | 112 | Discuss any alternative or rejected solutions. 113 | 114 | ## Change Set 115 | 116 | Fill in bullets for each area that will be affected by this change. 117 | 118 | - [ ] Cryptography 119 | - [ ] Noir 120 | - [ ] Aztec.js 121 | - [ ] PXE 122 | - [ ] Aztec.nr 123 | - [ ] Enshrined L2 Contracts 124 | - [ ] Private Kernel Circuits 125 | - [ ] Sequencer 126 | - [ ] AVM 127 | - [ ] Public Kernel Circuits 128 | - [ ] Rollup Circuits 129 | - [ ] L1 Contracts 130 | - [ ] Prover 131 | - [ ] Economics 132 | - [ ] P2P Network 133 | - [ ] DevOps 134 | 135 | ## Test Plan 136 | 137 | Outline what unit and e2e tests will be written. Describe the logic they cover and any mock objects used. 138 | 139 | ## Documentation Plan 140 | 141 | Identify changes or additions to the user documentation or protocol spec. 142 | 143 | 144 | ## Rejection Reason 145 | 146 | If the design is rejected, include a brief explanation of why. 147 | 148 | ## Abandonment Reason 149 | 150 | If the design is abandoned mid-implementation, include a brief explanation of why. 151 | 152 | ## Implementation Deviations 153 | 154 | If the design is implemented, include a brief explanation of deviations to the original design. 155 | 156 | ## Disclaimer 157 | 158 | The information set out herein is for discussion purposes only and does not represent any binding indication or commitment by Aztec Labs and its employees to take any action whatsoever, including relating to the structure and/or any potential operation of the Aztec protocol or the protocol roadmap. 
In particular: (i) nothing in these projects, requests, or comments is intended to create any contractual or other form of legal relationship with Aztec Labs or third parties who engage with this AztecProtocol GitHub account (including, without limitation, by responding to a conversation or submitting comments) (ii) by engaging with any conversation or request, the relevant persons are consenting to Aztec Labs’ use and publication of such engagement and related information on an open-source basis (and agree that Aztec Labs will not treat such engagement and related information as confidential), and (iii) Aztec Labs is not under any duty to consider any or all engagements, and that consideration of such engagements and any decision to award grants or other rewards for any such engagement is entirely at Aztec Labs’ sole discretion. Please do not rely on any information on this account for any purpose - the development, release, and timing of any products, features, or functionality remains subject to change and is currently entirely hypothetical. Nothing on this account should be treated as an offer to sell any security or any other asset by Aztec Labs or its affiliates, and you should not rely on any content or comments for advice of any kind, including legal, investment, financial, tax, or other professional advice. 
159 | -------------------------------------------------------------------------------- /abandoned/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/abandoned/.gitkeep -------------------------------------------------------------------------------- /dd.template.md: -------------------------------------------------------------------------------- 1 | # [Project Name] Design Document 2 | 3 | - Owner: 4 | - Approvers: 5 | - @[engineer 1] 6 | - @[engineer 2] 7 | - @[product manager] 8 | - @[other person on the PRD 1] 9 | - @[other person on the PRD 2] 10 | - [PRD (if applicable)](link to PRD including commit hash) 11 | - Target DD Approval Date: YYYY-MM-DD 12 | - Target Project Delivery Date: YYYY-MM-DD 13 | 14 | ## Executive Summary 15 | 16 | Summarize clearly and concisely the main design decisions, proposed changes, and their impact. 17 | 18 | If this work doesn't have a PRD, list the basic requirements for the work. 19 | 20 | ## Timeline 21 | 22 | Outline the timeline for the project. E.g. 23 | 24 | - Build component X : 2 days 25 | - Build component Y : 3 days 26 | - Build component Z : 4 days 27 | - Test : 3 days 28 | - Write docs : 2 days 29 | 30 | Total: 14 days 31 | 32 | ## Introduction 33 | 34 | Briefly describe the problem the work solves, and for whom. Include any relevant background information and the goals (and non-goals) of this implementation. 35 | 36 | ## Interface 37 | 38 | Who are your users, and how do they interact with this? What is the top-level interface? 39 | 40 | ## Implementation 41 | 42 | Delve into the specifics of the design. Include diagrams, code snippets, API descriptions, and database schema changes as necessary. Highlight any significant changes to the existing architecture or interfaces. 43 | 44 | Share any preliminary performance metrics. 
45 | 46 | Discuss any alternative or rejected solutions. 47 | 48 | ## Change Set 49 | 50 | Fill in bullets for each area that will be affected by this change. 51 | 52 | - [ ] Cryptography 53 | - [ ] Noir 54 | - [ ] Aztec.js 55 | - [ ] PXE 56 | - [ ] Aztec.nr 57 | - [ ] Enshrined L2 Contracts 58 | - [ ] Sequencer 59 | - [ ] AVM 60 | - [ ] Public Kernel Circuits 61 | - [ ] Rollup Circuits 62 | - [ ] L1 Contracts 63 | - [ ] Prover 64 | - [ ] Economics 65 | - [ ] P2P Network 66 | - [ ] DevOps 67 | 68 | ## Test Plan 69 | 70 | List key test scenarios or validation steps required to confirm the design meets all project requirements. 71 | 72 | ## Documentation Plan 73 | 74 | Identify changes or additions to the user documentation or protocol spec. 75 | 76 | ## Disclaimer 77 | 78 | The information set out herein is for discussion purposes only and does not represent any binding indication or commitment by Aztec Labs and its employees to take any action whatsoever, including relating to the structure and/or any potential operation of the Aztec protocol or the protocol roadmap. In particular: (i) nothing in these projects, requests, or comments is intended to create any contractual or other form of legal relationship with Aztec Labs or third parties who engage with this AztecProtocol GitHub account (including, without limitation, by responding to a conversation or submitting comments) (ii) by engaging with any conversation or request, the relevant persons are consenting to Aztec Labs’ use and publication of such engagement and related information on an open-source basis (and agree that Aztec Labs will not treat such engagement and related information as confidential), and (iii) Aztec Labs is not under any duty to consider any or all engagements, and that consideration of such engagements and any decision to award grants or other rewards for any such engagement is entirely at Aztec Labs’ sole discretion. 
Please do not rely on any information on this account for any purpose - the development, release, and timing of any products, features, or functionality remains subject to change and is currently entirely hypothetical. Nothing on this account should be treated as an offer to sell any security or any other asset by Aztec Labs or its affiliates, and you should not rely on any content or comments for advice of any kind, including legal, investment, financial, tax, or other professional advice. 79 | -------------------------------------------------------------------------------- /docs/faucets/prd.md: -------------------------------------------------------------------------------- 1 | # Testnet Faucet Project Requirements Document 2 | 3 | - Owner: @just-mitch 4 | - Approvers: 5 | - @aminsammara 6 | - @rahul-kothari 7 | - @LHerskind 8 | - @signorecello 9 | - Target PRD Approval Date: 2025-03-19 10 | - Target Project Delivery Date: 2025-03-28 11 | 12 | ## Key words 13 | 14 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC 2119](https://datatracker.ietf.org/doc/html/rfc2119). 15 | 16 | ## Background 17 | 18 | As part of running a testing network (test-net), we (Aztec Labs) wish for external actors to: 19 | 20 | - participate in the validator set 21 | - send transactions as users would. 22 | 23 | This is to ensure that we get feedback on the stability and ease of operating a network such as ours, and to allow people to try it out. 24 | 25 | Since we require a stake to become a validator and fees to pay for transactions, we need to distribute tokens to people. 26 | 27 | Currently, we use 2 tokens: 28 | 29 | 1. the fee paying asset, which can be minted by anyone 30 | 2. the staking asset, which can be minted by its owner only 31 | 32 | This allows us to separate transaction throughput control from control of the validator set. 
33 | 34 | Control of this kind is necessary in testnet because there is no economic value in the assets, thus validators have nothing at stake, and users would be free to DOS the network by flooding it with bogus transactions. 35 | 36 | By controlling the fee asset, we can: 37 | 38 | - prevent infinite minting, and thus a DOS against the potential users of the network 39 | 40 | > [!info] 41 | > Rate limiting the fee paying asset to control network congestion is not necessary or desirable, as we already have a mana limit mechanism for block construction. It is better to have users obtain a lot (but not infinite) of the fee asset, and thus use a lot of mana, so that they can perform transactions. 42 | 43 | By controlling the validator set, we can: 44 | 45 | - try to ensure that would-be validators are actually running synced nodes 46 | - set their withdrawer to an address that we control, to immediately kick if needed 47 | 48 | **Presently**: 49 | 50 | - anyone can mint whatever fee asset they want 51 | - we are one infinite mint from chaos 52 | - **we** need to **actively** mint funds and add to the validator set. 53 | 54 | ## Key Terms 55 | 56 | - Fee Asset: ERC20 that can be bridged to "Fee Juice" 57 | - Staking Asset: ERC20 that must be staked to become a validator 58 | 59 | ## Key assumptions and dependencies 60 | 61 | 1. The network can handle a large influx of transactions 62 | - Phil's explorations seem to validate that this is sane 63 | 2. The network cannot defend itself; we need tight control over validators to kick misbehavior 64 | - We expect that this will change rapidly as the network matures 65 | 66 | ## Desired User Flow(s) 67 | 68 | ### The End User 69 | 70 | Active Alice wants to try out what the Aztec network has to offer. She wants to do transfers, try NFTs and try out private DeFi. 
71 | 72 | Alice is happy as long as: 73 | 74 | - it is quick and easy to get hold of assets to cover her transaction fees 75 | 76 | ### The Developer 77 | 78 | Dave the developer wants to try out what the Aztec network has to offer. He wishes to build some cool new DeFi protocol, a fancy mechanism for escrows or just something with privacy. 79 | 80 | Dave is happy as long as: 81 | 82 | - it is quick and easy to get hold of assets to cover his deployment fees 83 | 84 | ### The Validator 85 | 86 | Validating Vlad wants to run a validator. 87 | 88 | Vlad is happy as long as: 89 | 90 | - he can easily start a node 91 | - easily get staking asset to run a validator on the node 92 | 93 | ## Requirements 94 | 95 | ### Functional Requirements (what the system does) 96 | 97 | #### FUNC-01 98 | 99 | Users MUST be able to run a 1-liner to get a fixed amount of fee asset on L1. 100 | 101 | Why: Else no one will be able to use the chain 102 | Where: Fallout from Alice's and Dave's user story 103 | 104 | #### FUNC-02 105 | 106 | It MUST be possible to update the amount of fee asset that users receive. 107 | 108 | Why: We acknowledge that the amount we initially give out may be too high or too low, and we need to be able to adjust it. 109 | Where: Anticipation of supply/demand shocks and potential DoS scenarios 110 | 111 | #### FUNC-03 112 | 113 | Well-behaved nodes MUST use the lower value between the contract-specified mana target and their environment variable when building blocks. 114 | 115 | Why: Otherwise nodes could build blocks that would fail on L1 116 | Where: Observed deficiency in the current implementation 117 | 118 | #### FUNC-04 119 | 120 | It MUST be possible to update the mana target of the rollup. 
121 | 122 | Why: Allows us to update the mana target of the rollup in response to changing conditions 123 | Where: Anticipation of supply/demand shocks and potential DoS scenarios 124 | 125 | #### FUNC-05 126 | 127 | Users MUST be able to submit an L1 address and complete a verification challenge for admission to the validator set. Producing the verification response SHOULD be a one-liner for anyone with a fully synced Aztec node. 128 | 129 | Why: Ensures at least a basic level of sybil resistance 130 | Where: Experience of validators joining the set 131 | 132 | #### FUNC-06 133 | 134 | We (Aztec Labs) MUST be able to add a user to the validator set outside of the faucet process. 135 | 136 | Why: We want to be able to add users to the validator set for testing purposes. 137 | Where: Experience of validators joining the set 138 | 139 | #### FUNC-07 140 | 141 | We (Aztec Labs) SHOULD be able to control the rate at which validators are added to the set. 142 | 143 | Why: to avoid mass validator joins. 144 | Where: Anticipation of people joining the set en masse when it is announced. 145 | 146 | ### Non-Functional Requirements (qualities the system has) 147 | 148 | #### QUAL-01 149 | 150 | The fee asset minting process SHOULD be permissionless. 151 | 152 | Why: We want to make it easy for anyone to try out the network. 153 | Where: Take Alice's story above, and assume she doesn't want to talk to anyone at Aztec Labs to get her assets. 154 | 155 | #### QUAL-02 156 | 157 | It MUST be easy for us (Aztec Labs) to remove a validator from the set that is not performing their job well. 158 | 159 | Why: Ensure the health of the network. 160 | Where: Experience of validators joining the set. 161 | 162 | #### QUAL-03 163 | 164 | It SHOULD NOT be trivial for people to create an arbitrary number of validators. 165 | 166 | Why: to maintain a diverse, performant validator set. 
167 | Where: Anticipation of DOS attacks 168 | 169 | ### Performance Requirements 170 | 171 | #### PERF-01 172 | 173 | Users requesting fee asset SHOULD NOT need to wait more than 1 minute. 174 | 175 | Why: This is one of the first actions users trying out the network should perform. It should be snappy. 176 | Where: Experience of users bailing on anything mildly inconvenient. 177 | 178 | #### PERF-02 179 | 180 | Would-be validators SHOULD NOT need to wait more than 30 minutes to get added to the validator set (not necessarily the committee). The process SHOULD require under 5 minutes of active participation - ideally allowing validators to initiate the request and later return to find their node participating in the set. 181 | 182 | Why: There may still be some manual intervention to add validators to the set, but it should still be time-bound and quick. 183 | Where: Experience adding external validators to the set. 184 | 185 | ## Tradeoff Analysis 186 | 187 | We recommend designs that are easy to implement and give us some protection, rather than convoluted designs that are bulletproof, considering this is for testnet only. 188 | 189 | ## Disclaimer 190 | 191 | The information set out herein is for discussion purposes only and does not represent any binding indication or commitment by Aztec Labs and its employees to take any action whatsoever, including relating to the structure and/or any potential operation of the Aztec protocol or the protocol roadmap. 
In particular: (i) nothing in these projects, requests, or comments is intended to create any contractual or other form of legal relationship with Aztec Labs or third parties who engage with this AztecProtocol GitHub account (including, without limitation, by responding to a conversation or submitting comments) (ii) by engaging with any conversation or request, the relevant persons are consenting to Aztec Labs’ use and publication of such engagement and related information on an open-source basis (and agree that Aztec Labs will not treat such engagement and related information as confidential), and (iii) Aztec Labs is not under any duty to consider any or all engagements, and that consideration of such engagements and any decision to award grants or other rewards for any such engagement is entirely at Aztec Labs’ sole discretion. Please do not rely on any information on this account for any purpose - the development, release, and timing of any products, features, or functionality remains subject to change and is currently entirely hypothetical. Nothing on this account should be treated as an offer to sell any security or any other asset by Aztec Labs or its affiliates, and you should not rely on any content or comments for advice of any kind, including legal, investment, financial, tax, or other professional advice. 192 | -------------------------------------------------------------------------------- /docs/mempools/dd.md: -------------------------------------------------------------------------------- 1 | # Limiting Client Mempools Design Document 2 | 3 | Our client tx mempool today grows unbounded. We need to place a limit on how many txs it absorbs. 4 | 5 | This addresses how to limit the total number of valid txs. It does **not** address how to protect against DOS attacks that cause the client to validate more txs than it can process. 6 | 7 | ## Tx validity 8 | 9 | A tx in Aztec requires the following checks to be valid. 
Static checks can be done just once, dynamic ones are done when the tx is received and again when the sequencer is building a block. If the tx fails validation during block building, it gets evicted, unless it failed due to gas fees lower than the block base fee. 10 | 11 | ### Static 12 | 13 | - L1 chain id and L2 version 14 | - Setup function 15 | - Correct public execution requests and logs 16 | - Minimum gas fees (base and priority) 17 | - Reasonable gas limit (currently missing) 18 | - Valid ClientIVC proof 19 | 20 | ### Dynamic 21 | 22 | - Max block number for inclusion 23 | - Double spend (repeated nullifiers) 24 | - Archive root exists (can become invalid after a reorg) 25 | - Fee juice balance for fee payer 26 | - Gas fees over current block base fee 27 | - Gas limit below current block limit 28 | 29 | ## Geth 30 | 31 | What does Geth do? Following is based on geth's [legacy (non-blob) pool implementation](https://github.com/ethereum/go-ethereum/blob/master/core/txpool/legacypool/legacypool.go) 32 | 33 | - Geth runs checks by looping over its mempool at regular intervals. This includes evicting txs, and promoting/demoting txs between executable and non-executable. 34 | - A tx is considered "executable" if it has no nonce gaps and the sender has enough balance to pay for its gas. 35 | - Txs are evicted based on mempool capacity, based on time, and based on the current sender balance (vs the tx max cost) and current max block size in gas (vs tx gas limit). 36 | - Geth enqueues txs per account (sender), and splits txs into executable and non-executable. Geth defines the concept of "slots", where each tx takes up a number of slots depending on its size in bytes.
Defaults: 37 | - `AccountSlots` Number of executable transaction slots guaranteed per account: 16 38 | - `GlobalSlots` Maximum number of executable transaction slots for all accounts: 4096 + 1024 39 | - `AccountQueue` Maximum number of non-executable transaction slots permitted per account: 64 40 | - `GlobalQueue` Maximum number of non-executable transaction slots for all accounts: 1024 41 | - `Lifetime` Maximum amount of time non-executable transactions are queued: 3 hours 42 | - Geth checks that txs have a minimum gas price before being accepted. For replacements (ie two txs with same sender and nonce), it checks that price bumps are at least of a given %. 43 | - `PriceLimit` Minimum gas price to enforce for acceptance into the pool: 1 44 | - `PriceBump` Minimum price bump percentage to replace an already existing transaction by nonce: 10% 45 | - When adding a new tx to the pool, after static validations, geth enqueues the tx as non-executable, and waits for the loop to promote it. 46 | - [If the tx pool is full](https://github.com/ethereum/go-ethereum/blob/80b8d7a13c20254a9cfb9f7cbca1ab00aa6a3b50/core/txpool/legacypool/legacypool.go#L691-L692), it discards cheaper txs based on gas tip (ie priority fees). Only the global slots seem to be considered here. 47 | - When cleaning up the pool in a loop, geth loops over pending (executable?) txs for every sender that has gone over AccountSlots, and drops txs (based on nonce) from them. It also loops over future (non-executable?) txs based on **heartbeats**: accounts with the most time without any activity get their txs pruned first. 48 | 49 | ## Difficulties 50 | 51 | In addition to all complications that Ethereum has, we also have the issue that a tx public execution can invalidate an arbitrary number of existing txs just by emitting nullifiers. We have no way of knowing that in advance.
52 | 53 | Also, while for Ethereum a "replacement" is just a tx with the same nonce and sender as an existing one, for us any tx that shares a nullifier can technically be a replacement. This also means that tx A may be a replacement for B and C, but B and C may be unrelated to each other. 54 | 55 | Also, while our `fee_payer` slightly matches Ethereum's `sender`, it's possible that many users (if not all) use the same very few fee payers (in our case, FPCs), so there is likely no point in setting limits per sender as Ethereum does. Remember that, thanks to privacy, we cannot know the sender of a tx. On the flip side, we know that two txs do come from the same user if they share a private-land nullifier. 56 | 57 | ## Design 58 | 59 | To recap, we need to consider: 60 | 61 | - Balance of fee-payers 62 | - Conflicting nullifiers 63 | - Max block number 64 | - Gas fees and limit vs current base fees and limits 65 | - Archive root (only on reorgs) 66 | 67 | We propose keeping the following indices for all txs. These indices are implemented as mappings from the given keys to the tx identifier in the backing LMDB store: 68 | 69 | - priority fee 70 | - fee-payer 71 | - nullifiers (indexes a tx by all of its nullifiers) 72 | - base fee 73 | - gas limit 74 | - max block number 75 | 76 | When adding a tx, we first run the trivial checks: 77 | 78 | - Correct L1 chain id and L2 version 79 | - Public setup function is acceptable 80 | - Correct public execution requests and logs 81 | - Gas fees (base and priority) above a given minimum 82 | - Valid ClientIVC proof 83 | - Max block number for inclusion is in the future 84 | - Double spend (repeated nullifiers) against existing state 85 | - Gas limit is below the current block gas limit 86 | - Archive root exists 87 | 88 | And then: 89 | 90 | - We check if the current balance of the fee payer, minus the max cost of all pending txs for that fee payer, is enough to pay for this tx. 
If it is not, we try evicting other txs with a lower priority fee. If that works, and all other checks pass, we include the tx dropping the others. 91 | - We check if it shares a nullifier with any existing pending tx (we already checked duplicates against current state at this point). If it pays more than all of the conflicting ones, and it passes all other checks, we include it and drop the other ones. 92 | - We check if the tx fees are above the current base fees. If not, we drop it. Note that we could save it for later in case fees drop in the future, but this means tracking two different pools (executable and non-executable, as geth does). 93 | - We check if we are below a configurable size/number of pending txs. If we are not, start dropping txs with lower priority fee (sorted by priority fee) until we get again below the threshold. 94 | - If we do add the tx, we index its max block number as the minimum of the tx's max-block-number and the current block number plus a configurable number. This allows us to evict txs after they'd been sitting in the pool for a very long time. 95 | 96 | When a new block is mined: 97 | 98 | - We drop all txs that share nullifiers with nullifiers from the mined blocks 99 | - We update the balance of fee payers and drop txs that can no longer be paid 100 | - We drop all txs with a computed max-block-number equal or lower than the mined one 101 | 102 | Note that we should not be dropping them, but rather pushing them to the side to reincorporate them in case of a reorg. But we will dismiss this for now. 103 | 104 | When a reorg happens, we crawl through all txs and evict the ones with a no-longer-valid archive root. We could also do this via an index, depending on how frequent we think reorgs will be. 105 | 106 | When building a block, for each tx we pick up: 107 | 108 | - We re-check nullifiers since public execution of previous txs in the block could invalidate the current one. 
If we fail validation, we do not drop the tx from the pool immediately; instead, we wait for the block to be mined, and for the p2p sync to evict the tx. 109 | - Note that, if we check duplicates against existing nullifiers on every block we add, we only need to check against nullifiers emitted during the block being built. 110 | - We check gas fees and limits against the current block base gas fees and limits. If we fail, we just skip the tx. 111 | 112 | ## Alternative approaches 113 | 114 | We can rely heavily on the fact that spamming txs in Aztec is expensive due to the ClientIVC proofs, keep only a global limit on the total number/size of txs, and simply evict based on total mempool size using priority fees, plus re-checking on every block mined. This is much easier to implement than the above. 115 | 116 | An attacker can still spam txs with a shared set of nullifiers to flood the pool with just their txs, but if the priority fee is high enough (if it's too low, the attacker's txs get replaced by other txs), one of those txs will be picked up soon enough and invalidate the others; assuming we filter out the invalid ones fast enough, the sequencer eventually gets to other valid txs in the pool. The main assumption is that an attacker cannot produce client proofs at a pace that lets them completely fill the mempool before the next block gets built. 117 | 118 | This approach still requires rejecting txs with an ineligible base fee or too large a gas limit, otherwise the attacker could flood the pool with non-executable txs. It also requires reviewing all txs on the mempool whenever a block is mined to drop them based on shared nullifiers, insufficient balance, or max-block-age.
119 | -------------------------------------------------------------------------------- /docs/prover-submission-cost/prd.md: -------------------------------------------------------------------------------- 1 | # PRD: Prover Submission Cost 2 | 3 | - Owner: @LHerskind 4 | - Approvers: 5 | - Product: @joeandrews, @aminsammara 6 | - Engineering: @just-mitch, @Maddiaa0 7 | - DevRel: 8 | - Target PRD Approval Date: 2025-03-21 9 | - Target Delivery Deadline: 2025-04-07 10 | 11 | > [!NOTE] 12 | > **Keywords** 13 | > The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC 2119](https://datatracker.ietf.org/doc/html/rfc2119). 14 | 15 | # Background 16 | 17 | > [!NOTE] 18 | > **Key Terms** 19 | > - **Multi-Proofs**: Scheme that allows redundancy in proof submission and shares payment between all submitters. 20 | > - **Mana**: The unit of work on the Aztec rollup 21 | > - **BaseFee**: The amount of fee asset to pay per _Mana_ to cover costs 22 | > - **Congestion Multiplier**: A multiplier on top of the _BaseFee_ that depends on the usage of the chain to limit spam etc. 23 | 24 | The act of producing blocks incurs a cost to the block builders: 25 | 26 | - The sequencer pays for publishing the block to the data availability layer and to perform some validity checks such as validating attestations. 27 | - The prover pays for producing the proof (proving) and then to publish and verify the proof to the base-layer. 
28 | 29 | We build the background on assuming that you are familiar with the following past design docs: 30 | 31 | - Prover Coordination Design([cdfb3a72](https://github.com/AztecProtocol/engineering-designs/blob/cdfb3a72e9b3e4415dcbfe04bd92878996472e6d/in-progress/8509-prover-coordination/design.md)) 32 | - Fee Design Doc ([dac7fdfb](https://github.com/AztecProtocol/engineering-designs/blob/dac7fdfbffb0b0d10ce0dff85221f7a6ece1933b/in-progress/8757-fees/design.md)) 33 | 34 | The _BaseFee_ was designed to cover: 35 | 36 | 1. the cost to prove the transactions in the block 37 | 2. the cost to submit the block 38 | 3. `1/N` cost to submit an epoch proof (`N` being epoch size) 39 | 40 | The BaseFee calculation originally assumed sufficient mana utilization per block to meet targets. Therefore, blocks with low utilization would not fully cover operational costs through fees alone. To address this, block rewards were introduced as a subsidy. 41 | 42 | Provers were compensated with a share of both fees and rewards, allocated according to their provided quotes. 43 | 44 | Then the move to [Prover Coordination: Multiproofs](https://hackmd.io/Ivn9axP1SFyEHjpAXVn62g) happened and the costs morphed into the following (`M` is number of proofs): 45 | 46 | 1. the cost to prove the transactions in the block `M` times 47 | 2. the cost to submit the block 48 | 3. `M/N` cost to submit an epoch proof 49 | 50 | However, our model is currently flawed as it is: 51 | 52 | - not taking `M` proofs into account when computing the BaseFee 53 | - paying the proof submission cost to the sequencer, not the prover 54 | 55 | These flaws are remnants of the quote-based prover coordination. 56 | 57 | ## Assumptions 58 | 59 | - There exists at least `M` provers that are willing to participate in block production. 60 | - If the true number is less than `M` we are overpaying for proofs. 
61 | - The gas estimate of the proof submission cost is somewhat precise 62 | - Even though it happens potentially many blocks before the proof submission itself. 63 | - We expect submission cost to be greater than proving cost. 64 | 65 | # User Stories 66 | 67 | ## End User 68 | 69 | As an end user, I want transaction fees to be predictable, transparent, and low, similar to using a single-sequencer system, so that I don’t face unexpectedly high costs. 70 | 71 | This is especially important during early network phases with low usage, when high fees might discourage me from continuing to use the network. 72 | 73 | > [!WARNING] 74 | > **Multi-proof overhead** 75 | > If the costs associated with submitting multiple proofs (M) are passed directly to users via the BaseFee, it may lead to significantly higher fees compared to centralized or single-proof setups. The additional application of the Congestion Multiplier on these inflated fees could further compound this issue, potentially causing users to abandon the network early on. 76 | 77 | ## Prover 78 | 79 | As an economically rational prover, I wish to participate in block production on the Aztec network, such that I can earn some money on all the machines that I have collected. 80 | 81 | In short, as a prover, I wish to: 82 | 83 | 1. Collect blocks and transactions as the chain grows 84 | 2. Run machines to prove transactions and roll them into an epoch proof 85 | 3. Submit epoch proofs to the base-layer for a share of the rewards 86 | 4. Profit??? 87 | 88 | > [!WARNING] 89 | > **Multi-proof commentary** 90 | > Because multi-proofs split rewards between all submitters and only pay out afterwards, we won't know ahead of time if there is profit or not. 91 | 92 | # Requirements 93 | 94 | ## Functional Requirements 95 | 96 | ### Earmarked Fees 97 | 98 | - **What**: Fees earmarked for specific actions (e.g., proof submission) **MUST** be directed to the entity incurring the associated cost.
99 | - **Why**: To maintain fairness and economic incentive alignment between actors. 100 | - **Where**: Derived from the current imbalance (sequencer receives funds without bearing costs). 101 | 102 | ### Consistent Proving 103 | 104 | - **What**: Actors that consistently produce proofs **SHOULD** receive a bigger share of the fees than inconsistent actors 105 | - **Why**: To maintain consistency in block production and provide some minimum quality of service 106 | - **Where**: Derived from concerns about potential abandonment during periods of low network utilization or initial adoption phases. 107 | 108 | ### Protection from Excessive Multi-Proof Costs 109 | 110 | - **What**: Transaction fees charged to users **SHOULD** increase at most sub-linearly due to the introduction of multi-proof submissions compared to a single-proof sequencer setup. 111 | - **Why**: To prevent users from experiencing unexpectedly high fees resulting from internal design decisions (such as multiple proofs), ensuring network adoption and retention. 112 | - **Where**: Derived from concerns about potential abandonment during periods of low network utilization or initial adoption phases. 113 | 114 | ## Non-functional Requirements 115 | 116 | ### Prover profitability 117 | 118 | - **What**: Proving **SHOULD** be profitable in the presence of `<=M` submitters, even during periods of low activity. 119 | - **Why**: To ensure chain-growth and avoid continuous pruning. 120 | - **Where**: Derived from other requirements on stable block production and looking at other chains and general economics. 121 | 122 | ### Scalability 123 | 124 | - **What**: In the presence of `<=M` submitters their profit **SHOULD** scale with increasing transaction volume and overall network usage. 125 | - **Why**: To avoid creating incentives for provers and sequencers to collude, deliberately limiting network throughput to maximize their individual profits at the expense of network growth.
126 | - **Where**: Derived from concerns regarding potential economic collusion. 127 | 128 | ### Inflation Bounds 129 | 130 | - **What**: The inflation of the staking asset **MUST** be less than 20% yearly 131 | - **Why**: Infinite inflation is not scalable. 132 | - **Where**: Common economic sense to limit inflation + @aminsammara for number. 133 | 134 | # Handling Tradeoffs 135 | 136 | When handling the tradeoffs, I believe we should step in the direction of "subsidise too much". 137 | 138 | For ignition there will be **no fees** since there are no transactions. If the provers cannot recoup their costs from the block rewards it is very limited who would be able **and** willing to participate. 139 | 140 | For alpha, it will also allow us to keep transaction fees low(er) while usage is low, making it easier for users to try the network. 141 | -------------------------------------------------------------------------------- /images/node40gb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/images/node40gb.png -------------------------------------------------------------------------------- /implemented/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/implemented/.gitkeep -------------------------------------------------------------------------------- /in-progress/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/.gitkeep -------------------------------------------------------------------------------- /in-progress/0003-native-merkle-trees.md:
-------------------------------------------------------------------------------- 1 | | | | 2 | | -------------------- | --------------------------------------------------------------------------------------------- | 3 | | Owners | @PhilWindle | 4 | | Approvers | @just-mitch @alexghr | 5 | | Target Approval Date | 2024-06-21 | 6 | 7 | 8 | ## Executive Summary 9 | 10 | This design attempts to solve the problem of slow sync and merkle tree insertion performance. 11 | 12 | 13 | ## Introduction 14 | 15 | We require high performance merkle tree implementations both to ensure nodes can stay synched to the network and sequencers/provers can advance the state as required to build blocks. Our current TS implementations are limited in their single-threaded nature and the unavoidable constraint of having to repeatedly call into WASM to perform a hash operation. 16 | 17 | Some analysis of the quantity of hashing and the time required can be found [here](https://hackmd.io/@aztec-network/HyfTK9U5a?type=view). 18 | 19 | This design proposes the creation of a set of multi-threaded merkle tree implementations in C++ using LMDB. It builds upon some previous prototyping to develop concurrent indexed tree insertions. 20 | 21 | ## Implementation 22 | 23 | There are many parts to this design, we will walk through them individually and discuss the choices made at each stage. 24 | 25 | ### Overall Architecture 26 | 27 | A new C++ binary, World State, will be created that will be started by the node software. It will be configured with the location in which Merkle Tree data should be stored. It will then accept and respond with msgpack-ed messages over one or more streams. The initial implementation will simply use stdio, but this will be abstracted such that this could be replaced by other stream-based mechanisms. 28 | 29 | To interface with the World State, an abstraction will be created at the `MerkleTreeDb` level.
This accurately models the scope of functionality provided by the binary as owner of all the trees. It was considered that the abstraction could sit at the level of individual trees, but this creates difficulty when we want to send an entire block to the World State to be inserted. This is an important use case as synching entire blocks is where significant performance optimisations can be made. 30 | 31 | 32 | ``` TS 33 | export type MerkleTreeDb = { 34 | [Property in keyof MerkleTreeOperations as Exclude]: WithIncludeUncommitted< 35 | MerkleTreeOperations[Property] 36 | >; 37 | } & Pick & { 38 | /** 39 | * Returns a snapshot of the current state of the trees. 40 | * @param block - The block number to take the snapshot at. 41 | */ 42 | getSnapshot(block: number): Promise; 43 | }; 44 | ``` 45 | 46 | An abstract factory will then be created to construct the appropriate concrete type based on whether an instance of the node has native World State or not. 47 | 48 | ### Interface 49 | 50 | The interface will be an asynchronous message based communication protocol. Each message is provided with meta data uniquely identifying it and is responded to individually. It is not necessary to wait for a response to a message before sending a subsequent message.
A simple message specification will be created, some examples of which are shown here: 51 | 52 | ``` C++ 53 | enum WorldStateMsgTypes { 54 | START_TREE_REQUEST = FIRST_APP_MSG_TYPE, 55 | START_TREE_RESPONSE, 56 | GET_TREE_INFO_REQUEST, 57 | GET_TREE_INFO_RESPONSE, 58 | INSERT_LEAVES_REQUEST, 59 | INSERT_LEAVES_RESPONSE, 60 | }; 61 | 62 | struct MsgHeader { 63 | uint32_t messageId; // Unique Id for the message 64 | uint32_t requestId; // Id of the message this is responding too (may not be used) 65 | 66 | MSGPACK_FIELDS(messageId, requestId); 67 | 68 | MsgHeader() = default; 69 | 70 | MsgHeader(uint32_t reqId) 71 | : requestId(reqId) 72 | {} 73 | 74 | MsgHeader(uint32_t msgId, uint32_t reqId) 75 | : messageId(msgId) 76 | , requestId(reqId) 77 | {} 78 | }; 79 | 80 | struct GetTreeInfoRequest { 81 | std::string name; 82 | 83 | MSGPACK_FIELDS(name); 84 | }; 85 | 86 | struct GetTreeInfoResponse { 87 | std::string name; 88 | uint32_t depth; 89 | bb::fr root; 90 | uint64_t size; 91 | bool success; 92 | std::string message; 93 | 94 | MSGPACK_FIELDS(name, depth, root, size, success, message); 95 | }; 96 | 97 | template struct TypedMessage { 98 | uint32_t msgType; 99 | MsgHeader header; 100 | T value; 101 | 102 | TypedMessage(uint32_t type, MsgHeader& hdr, const T& val) 103 | : msgType(type) 104 | , header(hdr) 105 | , value(val) 106 | {} 107 | 108 | TypedMessage() = default; 109 | 110 | MSGPACK_FIELDS(msgType, header, value); 111 | }; 112 | ``` 113 | 114 | ``` TS 115 | export type GetTreeInfoRequest = { 116 | name: string; 117 | } 118 | 119 | export type GetTreeInfoResponse = { 120 | name: string; 121 | depth: number; 122 | success: boolean; 123 | message: string; 124 | root: Buffer; 125 | size: bigint; 126 | } 127 | ``` 128 | 129 | ### LMDB 130 | 131 | LMDB is a high performance key-value database allowing for concurrent read/write access and fully ACID transactions. In particular, it supports up to 126 concurrent read transactions. 
Write transactions can be performed concurrently with reads but we won't use this. The majority of our World State operations only require read access to persisted data. 132 | 133 | There are 3 broad categories of World State operations: 134 | 135 | #### Reads 136 | 137 | Simply reading data from the trees is performed using either `committed` or `uncommitted` state. Committed state is that which has fully settled and is therefore not going to change over the course of building a block. It can only change upon settlement of a new block. Uncommitted reads will read from the pending state; it is not recommended that uncommitted reads are performed by anyone other than a sequencer/prover. 138 | 139 | Examples of reads are requesting sibling paths, state roots etc. 140 | 141 | #### Updates 142 | 143 | As a sequencer/prover inserts transaction side-effects, the resulting new state is computed and cached in memory. This allows for the separation of `committed` and `uncommitted` reads and the easy rolling back of unsuccessful blocks. 144 | 145 | #### Commits 146 | 147 | When a block settles, the node performs a commit. It verifies any uncommitted state it may have against that published on chain to determine if that state is canonical. If it is not, the `uncommitted` state is discarded and the node performs an `Update` operation using the newly published side effects. 148 | 149 | Once the node has the correct `uncommitted` state, it commits that state to disk. This is the only time that a write transaction is required against the database. 150 | 151 | ### Updating the World State 152 | 153 | The `Update` operation involves inserting side-effects into one or more trees. Depending on the type of tree, we can make significant optimisations to reduce the real-world time taken. 154 | 155 | #### Append Only 156 | 157 | Append only trees don't support the updating of any leaves.
New leaves are inserted at the right-most location and nodes above these are updated to reflect their newly hashed values. Optimisation here is simply a case of dividing the set of leaves into smaller batches and hashing each of these batches into a sub-tree in separate threads. Finally, the roots are used to build the sub-tree on top before hashing to the root of the main tree. 158 | 159 | #### Indexed Tree 160 | 161 | Indexed Trees require significantly more hashing than append only trees. In fact, adding a set of leaves to an Indexed Tree finishes with an append only tree insertion of the new leaves. However, before this, it is necessary to update all 'low-value' leaves first. 162 | 163 | For each leaf being inserted: 164 | 165 | 1. Identify the location of the leaf whose value immediately precedes that being inserted. 166 | 2. Retrieve the sibling path of the preceding leaf before any modification. 167 | 3. Set the 'next' value and index to point to the leaf being inserted. 168 | 4. Set the 'next' value and index of the leaf being inserted to the leaf previously pointed to by the leaf just updated. 169 | 5. Re-hash the updated leaf and update the leaf with this hash, requiring the tree to be re-hashed up to the root. 170 | 171 | Unfortunately, this process is very sequential with minimal opportunity for concurrent hashing. Each sibling path must be taken after having updated the 'low leaf' for the previous insertion. We can achieve a reasonable degree of concurrency here though. We first identify all of the 'low-leaf' values that need updating, then for each we merge steps 2 and 5, making a single pass up the tree, retrieving each node's value before overwriting it. We can then schedule each of these tree traversals as a unit of work to be carried out on a thread pool. 172 | 173 | For example, we have a depth 3 Indexed Tree and 2 leaves to insert. The first requires leaf at index 0 to be updated, the second requires leaf at index 1 to be updated.
174 | 175 | 1. Thread 1 reads the current leaf at level 2 (the leaf level), index 0 to populate its sibling path, then writes the new leaf value. 176 | 2. Thread 1 reads the sibling at level 2, index 1, writes the new hash into level 1, index 0 (the parent node). 177 | 3. Thread 1 signals that it has finished with level 2. 178 | 4. Thread 2, having waited for the signal to indicate level 2 is clear, can now start its traversal of the tree, performing the same procedure. 179 | 180 | In the above example, Thread 2 will follow Thread 1 up the tree, providing a degree of concurrency to the update operation. Obviously, this example is limited; in a 40 depth tree it is possible to have many threads working concurrently to build the new state without collision. 181 | 182 | In this concurrent model, each thread would use its own single read transaction to retrieve `committed` state and all new `uncommitted` state is written to the cache in a lock free manner as every thread is writing to a different level of the tree. 183 | 184 | ## Change Set 185 | 186 | Fill in bullets for each area that will be affected by this change. 187 | 188 | - [ ] L1 Contracts 189 | - [ ] Enshrined L2 Contracts 190 | - [ ] Private Kernel Circuits 191 | - [ ] Public Kernel Circuits 192 | - [ ] Rollup Circuits 193 | - [ ] Aztec.nr 194 | - [ ] Noir 195 | - [ ] AVM 196 | - [x] Sequencer 197 | - [ ] Fees 198 | - [ ] P2P Network 199 | - [ ] Cryptography 200 | - [ ] DevOps 201 | 202 | ## Test Plan 203 | 204 | As the World State is used heavily in all operations, we will gain confidence through the use of: 205 | 206 | 1. Unit tests within the C++ section of the repo. 207 | 2. Further sets of unit tests in TS, comparing the output of the native trees to that of the TS trees. 208 | 3. All end to end tests will inherently test the operation of the World State. 209 | 210 | ## Prototypes 211 | 212 | Areas of this work have been prototyped already.
The latest being [here](https://github.com/AztecProtocol/aztec-packages/pull/7037). -------------------------------------------------------------------------------- /in-progress/11103-forwarder-contract/design.md: -------------------------------------------------------------------------------- 1 | # Forwarder Contract 2 | 3 | | | | 4 | | -------------------- | ---------------------------------------------------------------------------------- | 5 | | Issue | [Forwarder Contract](https://github.com/AztecProtocol/aztec-packages/issues/11103) | 6 | | Owners | @just-mitch | 7 | | Approvers | @LHerskind @PhilWindle @spalladino @spypsy | 8 | | Target Approval Date | 2025-01-15 | 9 | 10 | ## Executive Summary 11 | 12 | Add a forwarder contract that allows the sequencer client to take multiple actions in the same L1 transaction. 13 | 14 | Adjust the sequencer client to batch its actions into a single L1 transaction. 15 | 16 | ## Introduction 17 | 18 | Within the same L1 block, one cannot make blob transactions and regular transactions from the same address. 19 | 20 | However, aztec node operators must be able to: 21 | 22 | - propose an l2 block 23 | - vote in the governance proposer contract 24 | 25 | in the same L1 block. 26 | 27 | ### Goals 28 | 29 | - Allow the sequencer client to take multiple actions in the same L1 transaction 30 | - No changes to governance/staking 31 | - Under 10 gas overhead per L2 transaction 32 | 33 | ### Non-goals 34 | 35 | - Support multiple actions for the prover node 36 | 37 | ## Interface 38 | 39 | Node operators will need to deploy a forwarder contract. 40 | 41 | When an attester deposits into the staking contract, the forwarder contract of the node operator will be specified as the proposer. 42 | 43 | The Aztec Labs sequencer client implementation will need to be updated to use the forwarder contract; this involves refactoring `yarn-project/sequencer-client/src/publisher/l1-publisher.ts`. 
44 | 45 | ## Implementation 46 | 47 | ### Forwarder Contract 48 | 49 | It is straightforward. 50 | 51 | ```solidity 52 | contract Forwarder is Ownable, IForwarder { 53 | using Address for address; 54 | 55 | constructor(address __owner) Ownable(__owner) {} 56 | 57 | function forward(address[] calldata _to, bytes[] calldata _data) 58 | external 59 | override(IForwarder) 60 | onlyOwner 61 | { 62 | require( 63 | _to.length == _data.length, IForwarder.ForwarderLengthMismatch(_to.length, _data.length) 64 | ); 65 | for (uint256 i = 0; i < _to.length; i++) { 66 | _to[i].functionCall(_data[i]); 67 | } 68 | } 69 | } 70 | 71 | ``` 72 | 73 | Note: this requires all the actions to succeed, so the sender must be sure that, e.g. a failed governance vote will not prevent the L2 block from being proposed. 74 | 75 | Note: this implementation is not technically part of the protocol, and as such will live in `l1-contracts/src/periphery`. 76 | 77 | ### Refactoring L1 Publisher 78 | 79 | L1 publisher will be broken into two classes: 80 | 81 | - within `@aztec/sequencer-client`, there will be a `SequencerPublisher` 82 | - within `@aztec/prover-node`, there will be a `ProverNodePublisher` 83 | 84 | Under the hood, both of these will use the `L1TxUtils` to create and send L1 transactions. 85 | 86 | The publisher had also had responsibilities as a "getter" of different information on L1. This will be refactored into classes specific to the individual contracts that are being queried, e.g. `yarn-project/ethereum/src/contracts/rollup.ts` has a `Rollup` class that is responsible for getting information from the rollup contract. 87 | 88 | ### `ProverNodePublisher` 89 | 90 | The `ProverNode` will have a `ProverNodePublisher` that has the functions currently within `l1-publisher.ts` that are related to the prover node, and have the same interface/semantics as the current `L1Publisher`. 
As an aside, this means `@aztec/prover-node` should no longer have a dependency on the `@aztec/sequencer-client` package. 91 | 92 | In essence, this class is an API for L1 transactions for the prover node, and a simple wrapper around the `L1TxUtils` class. 93 | 94 | ### `SequencerPublisher` 95 | 96 | The `SequencerClient` will have a `SequencerPublisher` that has many of the same functions currently within the `l1-publisher.ts`, but will have different semantics. 97 | 98 | The `SequencerPublisher` will have: 99 | 100 | - `requests: RequestWithExpiry[]` 101 | - knowledge of the sequencer's forwarder contract 102 | 103 | where: 104 | 105 | ```typescript 106 | type Action = "propose" | "claim" | "governance-vote" | "slashing-vote"; 107 | interface RequestWithExpiry { 108 | action: Action; 109 | request: L1TxRequest; 110 | // the last L2 slot that the request is valid for. 111 | // requests will be dropped if the sequencer is already past this slot. 112 | lastValidL2Slot: bigint; 113 | gasConfig?: L1GasConfig; 114 | blobConfig?: L1BlobInputs; 115 | onResult?: ( 116 | request: L1TxRequest, 117 | result?: { 118 | receipt: TransactionReceipt; 119 | gasPrice: GasPrice; 120 | stats?: TransactionStats; 121 | errorMsg?: string; 122 | } 123 | ) => void; 124 | } 125 | ``` 126 | 127 | The `Sequencer` will append to the `requests` list whenever it wants to: 128 | 129 | - propose an l2 block 130 | - cast a governance proposal vote 131 | - cast a slashing vote 132 | 133 | At end of every iteration of the Sequencer's work loop, it will await a call to `SequencerPublisher.sendRequests()`, which will send the queued requests to the forwarder contract, and flush the `requests` list. 134 | 135 | ### Cancellation/Resend 136 | 137 | A complication is that ethereum nodes make replacement of blob transactions expensive, and cancelation impossible, as they operate under the assumption that rollups seldom/never need to replace/cancel blob transactions. 
138 | 139 | See [geth's blob pool](https://github.com/ethereum/go-ethereum/blob/581e2140f22566655aa8fb2d1e9a6c4a740d3be1/core/txpool/blobpool/blobpool.go) for details/constraints. 140 | 141 | This is not true for Aztec's decentralized sequencer set with strict L1 timeliness requirements on L2 blocks. 142 | 143 | So a concern is the following scenario: 144 | 145 | - proposer A submits a tx with nonce 1 (with a blob) that is not priced aggressively enough 146 | - Tx1 sits in the blob pool, but is not included in an L1 block 147 | - proposer A tries to submit another transaction, but needs to know to use Tx2 148 | - Tx1 needs to be replaced with a higher fee, but it will revert if the network is in a different L2 slot and the bundle contained a proposal 149 | 150 | This is addressed by: 151 | 152 | - Upgrading viem to at least v2.15.0 to use their nonceManager to be aware of pending nonces 153 | - Aggressive pricing of blob transactions 154 | - The L1TxUtils will be able to speed up Tx1 (even if it reverts), which should unblock Tx2 155 | 156 | ### Setup 157 | 158 | There will be an optional environment variable `sequencer.customForwarderContractAddress` that can be used to specify a custom forwarder contract address. 159 | 160 | If this is not set, the sequencer will deploy the Aztec Labs implementation of the forwarder contract, using the Universal Deterministic Deployer, supplying the sequencer's address as the deployment salt, and the sequencer's address as the owner. 161 | 162 | ### Gas 163 | 164 | Including signatures, the calldata for a propose transaction is ~3.6KB. 165 | 166 | So the overhead of the forwarder contract is: 167 | 168 | 1. **Calldata Copying**: 169 | 170 | - Copy cost: 3 gas per 32‑byte word 171 | - 3600 bytes / 32 bytes per word ~= 113 words 172 | - 113 words × 3 gas ≈ 339 gas 173 | - Memory expansion cost: Memory is priced using the formula C(m) = 3\*m + floor(m²/512) for m words. 
174 | - For 113 words: 3×113 + floor(113²/512) = 339 + 24 ≈ 363 gas 175 | - Total for copying: 339 + 363 ≈ 702 gas 176 | 177 | 2. **Call Overhead**: 178 | - The basic cost of a call is about 700 gas. 179 | 180 | **Adding it up**: 181 | 182 | - 702 gas (copying/calculation) + 700 gas (call) ≈ 1402 gas 183 | 184 | Operating at 10TPS, this means an overhead of under 1402 gas / (10 transactions/s \* 36s) = 3.9 gas per L2 transaction. 185 | 186 | ### Future work 187 | 188 | For more robust cancellation, the sequencer client could maintain a pool of available EOAs, each of which are "owners"/"authorized senders" on its forwarder contract, and use one until it gets stuck, then switch to the next one: presumably by the time the sequencer client gets to the original EOA, the blob pool will have been cleared. 189 | 190 | ### Alternative solutions 191 | 192 | The original problem was voting at the same time as proposing an L2 block. 193 | 194 | The sequencer client could have done the voting in its first L1 slot available, and delayed production of the L2 block until the next L1 slot. 195 | 196 | This is unacceptable since the L2 blocks should eventually be published in the _first_ L1 slot available, to give the greatest chance of getting the L2 block included within our L2 slot. 197 | 198 | Alternatively, the EmpireBase contract could have an additional address specified by validators, specifying a separate address that would be used for governance voting. 199 | 200 | This seemed more complex, and has a similar problem when considering the flow where a proposer tries to vote instead of building a block (because there were no transactions at the start of the slot), but then a transaction became available, and they tried to build/propose an L2 block in the same slot; delays or other queueing in the sequencer client would be required regardless. 201 | 202 | ## Change Set 203 | 204 | Fill in bullets for each area that will be affected by this change. 
205 | 206 | - [ ] Cryptography 207 | - [ ] Noir 208 | - [ ] Aztec.js 209 | - [ ] PXE 210 | - [ ] Aztec.nr 211 | - [ ] Enshrined L2 Contracts 212 | - [ ] Private Kernel Circuits 213 | - [x] Sequencer 214 | - [ ] AVM 215 | - [ ] Public Kernel Circuits 216 | - [ ] Rollup Circuits 217 | - [x] L1 Contracts 218 | - [x] Prover 219 | - [ ] Economics 220 | - [ ] P2P Network 221 | - [ ] DevOps 222 | 223 | ## Test Plan 224 | 225 | The primary test is [cluster governance upgrade](https://github.com/AztecProtocol/aztec-packages/issues/9638), ensuring that block production does not stall (as it currently does). 226 | 227 | ## Documentation Plan 228 | 229 | No plans to document this as yet: the node operator guide effectively does not exist. 230 | 231 | ## Disclaimer 232 | 233 | The information set out herein is for discussion purposes only and does not represent any binding indication or commitment by Aztec Labs and its employees to take any action whatsoever, including relating to the structure and/or any potential operation of the Aztec protocol or the protocol roadmap. In particular: (i) nothing in these projects, requests, or comments is intended to create any contractual or other form of legal relationship with Aztec Labs or third parties who engage with this AztecProtocol GitHub account (including, without limitation, by responding to a conversation or submitting comments) (ii) by engaging with any conversation or request, the relevant persons are consenting to Aztec Labs' use and publication of such engagement and related information on an open-source basis (and agree that Aztec Labs will not treat such engagement and related information as confidential), and (iii) Aztec Labs is not under any duty to consider any or all engagements, and that consideration of such engagements and any decision to award grants or other rewards for any such engagement is entirely at Aztec Labs' sole discretion. 
Please do not rely on any information on this account for any purpose - the development, release, and timing of any products, features, or functionality remains subject to change and is currently entirely hypothetical. Nothing on this account should be treated as an offer to sell any security or any other asset by Aztec Labs or its affiliates, and you should not rely on any content or comments for advice of any kind, including legal, investment, financial, tax, or other professional advice. 234 | -------------------------------------------------------------------------------- /in-progress/12487-prd-fast-sync.md: -------------------------------------------------------------------------------- 1 | # Fast-Sync Project Requirements Document 2 | 3 | - Owners: @spalladino, @aminsammara 4 | - Approvers: 5 | - @AndreOxski 6 | - @charlielye 7 | - @PhilWindle 8 | - @joshcrites 9 | 10 | ## Background 11 | 12 | Syncing an L2 node requires crawling L1 since L2 genesis, downloading events and blobs, and updating world-state for every update. This is time-consuming, and proportional to the length of the chain. 13 | 14 | ## Desired User Flow 15 | 16 | Node operators should be able to sync the chain in nearly constant time by downloading a recent snapshot, and syncing only the most recent blocks. Operators should be able to do this by specifying a snapshot URL to sync from. 17 | 18 | This flow is available in most Ethereum clients. 19 | 20 | ### Node operators 21 | 22 | Users SHOULD be able to specify which method to use to sync their clients. Methods are either `fast` (snapshots) or `full` (for syncing from the chain). Default is `fast`. For example: 23 | `aztec start --node --publisher --sync-method fast` 24 | 25 | Users SHOULD also be able to specify the location for the snapshot. Protocols supported are ipfs and https. 
For example: 26 | `aztec start --node --publisher --sync-method fast --snapshot-url ipfs://...` 27 | 28 | Users SHOULD also be able to specify the location for a snapshot index, which is a machine-readable file that lists the latest snapshots. Protocols supported are https only. For example: 29 | `aztec start --node --publisher --sync-method fast --snapshot-index https://...` 30 | 31 | ### Labs 32 | 33 | Snapshots for external networks must be generated and uploaded by Labs to IPFS on at most a 2-week basis, ideally weekly. This process should be automated. 34 | 35 | ## Requirements 36 | 37 | - Snapshots can be client-specific. 38 | - Snapshots are not to be downloaded via p2p. 39 | - Any regular node should be able to generate the snapshot with a given command. 40 | - Nodes must be able to specify which snapshot to sync from by supplying an IPFS CID or URL. 41 | - The IPFS gateway used by the node is configurable, and has a reasonable default (eg node run by Labs or well-known service). 42 | - Nodes must also be able to specify a snapshot index, and pick the latest one listed. 43 | - The time for syncing from a snapshot (without validating it) should not depend on the chain length (other than I/O). 44 | 45 | ### Snapshots index 46 | 47 | The format of the snapshots index is left open, but must include the L2 block hash and number, corresponding L1 block number, and either an https or ipfs identifier for obtaining the corresponding snapshot data. This file must be machine-readable, ideally JSON. 48 | 49 | ### Trust assumptions 50 | 51 | For an initial version of this feature, snapshots are trusted, meaning that the downloader trusts the uploader and does not verify its integrity. It only checks that the resulting world state root and latest archive match valid on-chain values. 52 | 53 | ### Hosting and defaults 54 | 55 | Aztec Labs is to generate these snapshots on a weekly basis (preferred, every 2 weeks at least) and upload them to IPFS. 
Labs is to ensure the availability of this snapshot, either by directly hosting a node or by relying on a pinning service. 56 | 57 | Snapshots should be listed in the snapshots index, stored in a well-known location (eg S3 bucket) hosted by Labs. Nodes will default to this S3 index, pick the latest snapshot, and use fast-sync by default. 58 | 59 | ### Snapshot contents 60 | 61 | The snapshot should contain all information needed for the client to be considered synced up to a given L2 chain tip. For the current client being built, this is the archiver and world state databases. Snapshot should not include any data whatsoever from the P2P layer. This should only be data syncable from L1. The resulting state of downloading a snapshot should be indistinguishable from a full sync from L1. 62 | -------------------------------------------------------------------------------- /in-progress/12639-prd-inactivity-leak.md: -------------------------------------------------------------------------------- 1 | # Detecting Slashable Validators: Project Requirements Document 2 | 3 | - Owner: @spalladino 4 | - Approvers: 5 | - @aminsammara 6 | - @charlielye 7 | - @PhilWindle 8 | - @maddiaa 9 | - @LHerskind 10 | 11 | ## Background 12 | 13 | We currently have a slashing mechanism built on top of governance, where sequencers can vote to slash a node if agreed. Issue with this approach is that slashing takes time, and incentives (ie not wanting to lose money!) play a big role in it. 14 | 15 | However, during early testnet phases, where nodes do not have so much at stake, and failures are to be expected, we want to slash fast to ensure we don't halt block production due to a significant portion of validators being offline. 16 | 17 | So, for testnet only, we want to have a centralized entity who can rapidly slash or kick out validators from the set. This process can be manual, but we need to provide the slasher with accurate info on who to slash. 
18 | 19 | ## Desired User Flow 20 | 21 | The testnet Aztec Slasher (or AS for short, after their initials) must be able to gather all info needed for slashing from an Aztec Node under their control. Note that given slashing is centralized, we can use information from a single trusted node to decide who to slash. 22 | 23 | AS should be able to access this information either via a dashboard, or by hitting their node RPC interface and getting the data in JSON format. It must be immediately clear which nodes need to be slashed from this data. For example: 24 | 25 | ``` 26 | $ curl -XPOST https://my-node/ -d'{"method": "node_getSlashable"}' 27 | { 28 | "synchedTo": { "l1Block": 1, "l2Block": 1, "l2BlockHash": "0xabcd" } , 29 | "validators": [ 30 | { "address": "0x01", "lastBlockProposedAt": "2025-01-01T01:01:01", "lastBlockAttestedAt": "2025-01-01T01:01:01", "missedAttestationsStreak": 20, "missedProposalsStreak": 5, "missedProposalsRate": 0.1, "missedAttestationsRate": 0.2, "joinedAt": "2025-01-01T01:01:01" } 31 | ] 32 | } 33 | ``` 34 | 35 | Once the addresses to slash have been gathered, AS, using a set of privileged L1 keys, should be able to call the L1 slasher contract to execute the slash in a single tx. This should be either executed through the Aztec CLI, or the Aztec CLI should generate the payload to run through cast. For example: 36 | 37 | ``` 38 | $ aztec slash --slasher-client-address 0xabcd --private-key 0x1234 0x01 0x02 39 | ``` 40 | 41 | ## Requirements 42 | 43 | - Data must be clear on which addresses must be slashed, clear on why the addresses were selected, and must be updated every epoch. 44 | - Given a minimum number of `1` active and online validator, the network MUST always eventually resume block building regardless of how many other validators went offline. 45 | 46 | ### Identifying slashable validators 47 | 48 | - Consider both block proposals and attestations. 
49 | - Missed attestations and proposals are reported both as current streak and rolling average. 50 | - Current streak is defined as the number of missed attestations or proposals since the last activity. This is useful for detecting non-malicious validators that may be offline. 51 | - Rolling average is defined as the number of missed attestations or proposals over the total expected for the last N epochs, where N is configurable and defaults to 50. This will be useful for detecting malicious or unreliable validators long-term. 52 | - Attestations should be gathered from both L1 and the p2p attestation pool. 53 | - Rationale is that block proposers post to L1 only the attestations they need, so we need to look into the attestation pool to gather all of them. We also look into L1 since our node is not guaranteed to receive all attestations via p2p in case of a p2p issue. 54 | - Proposals should be gathered from both L1 and the p2p attestation pool. 55 | - Rationale is that if a proposal cannot be posted due to not enough attestations (because more than 1/3 of the committee is down) we don't want to punish the proposer. 56 | - List of slashable validators should include all validators that have missed at least one attestation or proposal (since their time of last activity). 57 | - AS should then filter the resulting data based on how aggressive they want to be in slashing, using `jq` or a script. 58 | - An address that fulfilled all their duties but had no activity due to not being selected for a committee must not be selected to be slashed. 59 | - A validator selected for attestation must not be counted towards a missed attestation if there was no proposal seen for that slot. 60 | 61 | ## Future work 62 | 63 | Slashing via CLI is a temporary measure so AS can quickly remove unreliable validators that are holding back the network. 
We know that we'll have to build a proper decentralized mechanism for executing slashing in the near future, but this is a stopgap solution that is low effort to implement, and builds on top of identifying the slashable validators which will be reused. 64 | -------------------------------------------------------------------------------- /in-progress/5040-native-merkle-trees-napi.md: -------------------------------------------------------------------------------- 1 | | | | 2 | | -------------------- | --------------------------------- | 3 | | Issue | [Native Merkle Trees](https://github.com/AztecProtocol/aztec-packages/issues/5040) | 4 | | Owners | @alexghr @PhilWindle | 5 | | Approvers | @just-mitch @spalladino @ludamad @charlielye @fcarreiro | 6 | | Target Approval Date | 2024-07-05 | 7 | 8 | ## Executive Summary 9 | 10 | This document proposes integrating the [Native Merkle Trees database](https://github.com/AztecProtocol/engineering-designs/blob/f9d1a897303c1481c790cecc4616961e1c183622/in-progress/0003-native-merkle-trees.md) directly into the TypeScript project using a native module written in C++ using [Node-API](https://nodejs.org/docs/latest-v18.x/api/n-api.html) rather than message passing. 11 | 12 | ## Introduction 13 | 14 | The original native Merkle tree spec proposed building a `MerkleTreesDb` native binary in C++. The TypeScript code would use message passing over streams to communicate with the database. A long lived process would be started once and accept messages over an input stream (e.g. stdin or a socket), process the messages and return the result over another stream (e.g. stdout). 15 | 16 | [Node-API](https://nodejs.org/docs/latest-v18.x/api/n-api.html) is an API for building native addons that integrate seamlessly into NodeJS. 17 | 18 | This approach would simplify deployment and maintenance (no new binaries need to be managed/started) while providing an easier to use interface from the TypeScript side. 
19 | 20 | ## Interface 21 | 22 | A new module would be written in C++ that would adapt the existing Native Merkle Trees database to Node-API semantics. This module could sit alongside the stream-based message passing implementation detailed in the [original spec](https://github.com/AztecProtocol/engineering-designs/blob/f9d1a897303c1481c790cecc4616961e1c183622/in-progress/0003-native-merkle-trees.md#interface) 23 | 24 | This module would be built with CMake normally as the rest of the C++ code, with the exception that its build artifact would be a shared library (with a custom extension `.node` instead of `.so`). The TypeScript project would use [`bindings`](https://www.npmjs.com/package/bindings) to load the native module and re-export the functions and classes from C++. 25 | 26 | > [!NOTE] 27 | > TypeScript definitions would have to be written from the C++ code. Ideally these would be generated from existing code, but if that doesn't work then they would have to be written and maintained manually. 28 | 29 | ## Implementation 30 | 31 | The implementation would use the [Node Addon API](https://github.com/nodejs/node-addon-api) instead of Node-API directly. Node Addon API is a C++ wrapper (by the Nodejs team) of N-API and exposes an object oriented interface to N-API. 
32 | 33 | ```tree 34 | barretenberg/cpp/src/barretenberg 35 | # other modules 36 | ├── crypto 37 | │   └── merkle_tree # tree implementations, leaf types, lmdb integration, etc 38 | ├── world_state # equivalent of MerkleTrees from TypeScript 39 | ├── world_state_napi # <--- the proposed new module 40 | └── world_state_service # binary using message passing 41 | ``` 42 | 43 | ### Addon 44 | 45 | The module would export a single Addon class: 46 | 47 | ```cpp 48 | // world_state_addon.hpp 49 | class WorldStateAddon : public Napi::ObjectWrap { 50 | public: 51 | WorldStateAddon(const Napi::CallbackInfo&); 52 | 53 | Napi::Value getTreeMetaData(const Napi::CallbackInfo&); 54 | Napi::Value getSiblingPath(const Napi::CallbackInfo&); 55 | // etc other methods from the public API of [MerkleTrees in TS](https://github.com/AztecProtocol/aztec-packages/blob/88d43e753079f9b0c263b655bfd779c2098e9097/yarn-project/world-state/src/world-state-db/merkle_trees.ts) 56 | 57 | static Napi::Function get_class(Napi::Env); 58 | 59 | private: 60 | std::unique_ptr _world_state_svc; 61 | }; 62 | ``` 63 | 64 | ```cpp 65 | // world_state_addon.cpp 66 | WorldStateAddon::WorldStateAddon(const Napi::CallbackInfo& info) 67 | : ObjectWrap(info) 68 | { 69 | Napi::Env env = info.Env(); 70 | 71 | if (info.Length() < 1) { 72 | Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException(); 73 | return; 74 | } 75 | 76 | if (!info[0].IsString()) { 77 | Napi::TypeError::New(env, "Directory needs to be a string").ThrowAsJavaScriptException(); 78 | return; 79 | } 80 | 81 | std::string data_dir = info[0].ToString(); 82 | _world_state_svc = std::make_unique(data_dir); 83 | } 84 | 85 | Napi::Value WorldStateAddon::getLeafValue(const Napi::CallbackInfo& info) 86 | { 87 | auto env = info.Env(); 88 | Napi::Promise::Deferred deferred(env); 89 | 90 | auto tree_id = info[0].As(); 91 | bool lossy; 92 | bb::crypto::merkle_tree::index_t leaf_index = info[0].As().Uint64Value(&lossy); 93 | if (lossy) { 
94 | deferred.Reject(Napi::TypeError::New(env, "Invalid leaf index").Value()); 95 | return deferred.Promise(); 96 | } 97 | 98 | bool include_uncomitted = info[2].As(); 99 | 100 | // pointer to helper class for async code (gets cleaned up later), see below 101 | auto* tree_op = new bb::world_state::TreeOp(env, deferred, [=]() { 102 | bb::crypto::merkle_tree::Signal signal(1); 103 | bb::fr leaf(0); 104 | auto callback = [&](bb::fr& value) { 105 | leaf = value; 106 | signal.signal_level(0); 107 | }; 108 | // for illustration purposes only, actual function call will be different 109 | _world_state_svc[tree_id].get_leaf_value(leaf_index, include_uncomitted, callback); 110 | signal.wait_for_level(0); 111 | return leaf; 112 | }); 113 | 114 | tree_op ->Queue(); 115 | 116 | return deferred.Promise(); 117 | } 118 | // etc. 119 | 120 | // init the module 121 | Napi::Function WorldStateAddon::get_class(Napi::Env env) 122 | { 123 | return DefineClass(env, "WorldState", 124 | { 125 | WorldStateAddon::InstanceMethod("getLeafValue", &WorldStateAddon::getLeafValue), 126 | // other instance methods 127 | }); 128 | } 129 | 130 | Napi::Object Init(Napi::Env env, Napi::Object exports) 131 | { 132 | Napi::String name = Napi::String::New(env, "WorldState"); 133 | exports.Set(name, WorldStateAddon::get_class(env)); 134 | return exports; 135 | } 136 | 137 | NODE_API_MODULE(addon, Init) 138 | ``` 139 | 140 | > [!NOTE] 141 | > The instance methods on the C++ class will be exported as instance methods on the JavaScript instance too. 142 | > Instance methods _must_ return a Napi::Value (ie. `any` in TS-land), even though the method returns something more specific (e.g. a Promise) and accept a single Napi::Callback parameter. 143 | > Instance methods can not be `const`. 
144 | 145 | The equivalent TS code would look like this: 146 | 147 | ```ts 148 | const bindings = require('bindings'); // from the bindings npm package 149 | const { WorldState }= bindings('world_state_napi'); // looks for the dynamic library named world_state_napi.node in a set of known folders (relative to package.json) 150 | 151 | async function main() { 152 | const worldState = new WorldState('./data'); // WorldState is the name under which the C++ class was exported 153 | const firstLeaf = await worldState.getLeafValue("notes_tree", 0, false); 154 | console.log(Fr.fromString(firstLeaf)); 155 | 156 | await worldState.handleL2BlockAndMessages(L2Block.random(), []); 157 | console.log(Fr.fromString(await worldState.getLeafValue('notes_tree', 0, false))); 158 | } // as soon as main finishes executing, `worldState` goes out of scope and at some point gets garbage collected which in turn calls its C++ destructor. 159 | 160 | main(); 161 | ``` 162 | 163 | ### Classes & instances 164 | 165 | Exported classes from the C++ side can be used and instantiated from NodeJS. Node Addon API is responsible for the glue code that ties the two together (ie. calling a function on the JS object calls the appropriate function in C++). The JS instance is a reference to the instance inside C++. The C++ instance is able to instantiate any other classes or allocate and access as much memory as needed. 166 | 167 | When an instance is garbage collected on the TS-side, the destructor is called on the C++ side. 168 | 169 | ### Passing data between NodeJS and C++ 170 | 171 | The `Napi` namespace on the C++ contains helper classes to deal with JS primitive values. Strings, numbers, bigints, buffers, arrays, typed arrays and even functions can be freely passed between the two environments. 172 | 173 | More complex data structures must be serialized/deserialized. 
We will use msgpack for this as it's already implemented in the C++ code. 174 | 175 | [`Napi::Value` documentation](https://github.com/nodejs/node-addon-api/blob/cc06369aa4dd29e585600b8b47839c1297df962d/doc/value.md) 176 | 177 | ### Message passing 178 | 179 | Instead of exporting a function for each operation on the world state, we could instead leverage the existing message passing interface, only instead sending the message across the TS/C++ boundary. This would simplify the module initialization code on the C++ side (only requiring we export a single function) and we'd benefit from easily serializing data types with msgpack. 180 | 181 | ### Async code 182 | 183 | The C++ code gets executed on the main Nodejs thread. Care has to be taken not to block the thread since that would prevent other JS code from running until the callback is finished. 184 | 185 | Running normal async code on the main thread is not supported: 186 | 187 | ```cpp 188 | Napi::Value WorldStateAddon::getMetaData(const Napi::CallbackInfo& info) 189 | { 190 | Napi::Promise::Deferred deferred(env); 191 | bb::crypto::merkle_tree::Signal signal(1); 192 | 193 | // getting the meta data directly from a merkle tree using callbacks 194 | auto completion = [&](const std::string&, uint32_t, const bb::crypto::merkle_tree::index_t&, const bb::fr& r) -> void 195 | { 196 | deferred.Resolve(Napi::String::New(env, format(r))); 197 | signal.signal_level(0); 198 | }; 199 | 200 | _notes_tree->get_meta_data(false, completion); 201 | signal.wait_for_level(0); 202 | 203 | return deferred.Promise(); 204 | } 205 | ``` 206 | 207 | In the context of running inside the Nodejs runtime the code above has undefined behavior. It could segfault or hang indefinitely. 
208 | 209 | The correct way of running async operations is to wrap the code in an [AsyncWorker](https://github.com/nodejs/node-addon-api/blob/cc06369aa4dd29e585600b8b47839c1297df962d/doc/async_worker.md) so that the Nodejs runtime can track its execution properly: 210 | 211 | ```cpp 212 | using tree_op_callback = std::function; 213 | class TreeOp : public AsyncWorker { 214 | public: 215 | TreeOp(Napi::Env env, Promise::Deferred& deferred, tree_op_callback& callback) 216 | : AsyncWorker(env) 217 | , _callback(callback) 218 | , _deferred(deferred) 219 | , _result(0) 220 | {} 221 | 222 | ~TreeOp() override = default; 223 | 224 | void Execute() override 225 | { 226 | try { 227 | _result = _callback(); 228 | } catch (const std::exception& e) { 229 | SetError(e.what()); 230 | } 231 | } 232 | 233 | void OnOK() override { _deferred.Resolve(String::New(Env(), format(_result))); } 234 | void OnError(const Napi::Error& e) override { _deferred.Reject(e.Value()); } 235 | 236 | private: 237 | tree_op_callback _callback; 238 | Promise::Deferred _deferred; 239 | bb::fr _result; 240 | } 241 | ``` 242 | 243 | `AsyncWorker.Queue` enqueues the execution of the worker at a later time on a thread managed by Node's libuv runtime. 244 | 245 | > [!IMPORTANT] 246 | > Inside `Execute()` code _must not_ access the JavaScript environment. This means everything that's needed to complete the operation _must_ be copied from the JS environment to memory owned by C++ before the task is queued up. 247 | > This also means that `Execute()` _can not_ create instances of `Napi::Value` since it does not have access to a `Napi::Env`. 248 | 249 | The `Execute` function runs on a separate libuv thread. The code is then able to fan out work to other system threads. Once the async code finishes executing on the worker thread, one of the two event callbacks gets run on _the main NodeJS thread_. 
At this point the result of the async operation must be turned into a `Napi::Value` and returned back to the NodeJS code. 250 | 251 | AsyncWorker instances have to be pointers otherwise they'd get destroyed as soon as the sync function that created them finishes executing. Enqueueing a worker makes N-API/libuv responsible for clean up after the worker reports its result back to NodeJS. 252 | 253 | On the NodeJS side, C++ code wrapped in an `AsyncWorker` runs independent of the event loop. This means that the event loop is able to continue executing other queued up work while the C++ runs in the background to resolve its promise. 254 | 255 | ### Memory limit 256 | 257 | NodeJS has a heap limit of about 4GB by default. This limit does not apply to the C++ module. The following code was used to allocate 40GB of RAM inside of a NodeJS process: 258 | 259 | ```cpp 260 | // world_state_addon.hpp 261 | class WorldStateAddon : public Napi::ObjectWrap { 262 | // ... 263 | private: 264 | std::vector> _data; 265 | } 266 | 267 | // world_state_addon.cpp 268 | WorldStateAddon::WorldStateAddon(const Napi::CallbackInfo& info) 269 | : ObjectWrap(info) 270 | { 271 | // 40 * 1GB chunks 272 | size_t chunks = 40; 273 | for (size_t i = 0; i < chunks; i++) { 274 | this->_data.emplace_back(1024 * 1024 * 1024); 275 | } 276 | } 277 | ``` 278 | 279 | ![40GB RAM](../images/node40gb.png) 280 | 281 | ### Error handling 282 | 283 | Unhandled exceptions in the C++ code will crash the NodeJS process. Errors must be propagated correctly to the JS side if they can not be handled in C++. 284 | 285 | The C++ exceptions flag will be turned on at compile time so exception bubble naturally to the JS side. For async code errors should be returned by rejecting the associated promises. 
286 | 287 | [Error handling documentation](https://github.com/nodejs/node-addon-api/blob/cc06369aa4dd29e585600b8b47839c1297df962d/doc/error_handling.md) 288 | 289 | ### Build changes 290 | 291 | The Node Addon API is distributed as an npm package (even though it contains C++ code). The new `world_state_napi` module would need to have a small `package.json` specifying the right version of the library: 292 | 293 | ```json 294 | { 295 | "name": "@aztec/world_state_napi", 296 | "version": "0.0.0", 297 | "dependencies": { 298 | "node-addon-api": "^8.0.0", 299 | "node-api-headers": "^1.1.0" 300 | }, 301 | "binary": { 302 | "napi_versions": [9] 303 | } 304 | } 305 | ``` 306 | 307 | The CMake build script for this module would then have to add the code from `node_modules` to the module's dependency list: 308 | 309 | ```cmake 310 | # the require command outputs the path with double quotes and new lines 311 | execute_process( 312 | COMMAND node -p "require('node-addon-api').include" 313 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 314 | OUTPUT_VARIABLE NODE_ADDON_API_DIR 315 | ) 316 | 317 | # strip the quotes and new lines 318 | string(REGEX REPLACE "[\r\n\"]" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR}) 319 | target_include_directories(world_state_napi PRIVATE ${NODE_ADDON_API_DIR}) 320 | 321 | # similar for node-api-headers 322 | ``` 323 | 324 | ### PIC 325 | 326 | Position independent code (`-fPIC` compiler flag) has to be enabled for bb libraries since the `world_state_napi` will be a shared library. 327 | 328 | ## Change Set 329 | 330 | Fill in bullets for each area that will be affected by this change.
331 | 332 | - [ ] L1 Contracts 333 | - [ ] Enshrined L2 Contracts 334 | - [ ] Private Kernel Circuits 335 | - [ ] Public Kernel Circuits 336 | - [ ] Rollup Circuits 337 | - [ ] Aztec.nr 338 | - [ ] Noir 339 | - [ ] AVM 340 | - [x] Sequencer 341 | - [ ] Fees 342 | - [ ] P2P Network 343 | - [ ] Cryptography 344 | - [ ] DevOps 345 | 346 | ## Test Plan 347 | 348 | The `world_state` module (pure C++ working directly with trees) will continue to be extensively unit tested. The `world_state_napi` (the node-addon-api wrapper) module will instead be tested as part of running the aztec-node. 349 | 350 | ## Documentation Plan 351 | 352 | N/A 353 | 354 | ## Rejection Reason 355 | 356 | N/A 357 | 358 | ## Abandonment Reason 359 | 360 | N/A 361 | 362 | ## Implementation Deviations 363 | 364 | N/A 365 | -------------------------------------------------------------------------------- /in-progress/7025-instrumenting-the-node-with-open-telemetry.md: -------------------------------------------------------------------------------- 1 | | | | 2 | | -------------------- | --------------------------------------------------------------------------- | 3 | | Issue | [Node metrics](https://github.com/AztecProtocol/aztec-packages/issues/7025) | 4 | | Owners | @alexghr | 5 | | Approvers | | 6 | | Target Approval Date | 2024-06-14 | 7 | 8 | ## Executive Summary 9 | 10 | The node should emit useful stats about the way it's running so that node operators can monitor its performance and resource usage. 11 | 12 | ## Introduction 13 | 14 | In order to confidently deploy and maintain a node in production it needs to provide basic information about how it's operating. These metrics need to be emitted in a portable manner so that monitoring tools can easily ingest them. These metrics should be optional such that running a node does not require running any other infrastructure to ingest the metrics.
15 | 16 | OpenTelemetry is a framework for capturing instrumentation data from applications and encoding them into a standard format that's vendor neutral. In the past we've used Prometheus and Grafana to capture metrics from services, OpenTelemetry would enable us to continue running that stack while also giving the community the chance to ingest data into different systems (e.g. Clickhouse, DataDog). 17 | 18 | Initially the node metrics will include system usage and the Aztec stats emitted during benchmarks, but the metrics service should be flexible enough to accept new stats as they are developed. 19 | 20 | ## Interface 21 | 22 | Enabling metrics in the node would require running an OpenTelemetry Collector to batch the data, a Prometheus instance to ingest all of this data and a Grafana dashboard to render the data. As part of this work the Docker Compose file used to run the Sandbox will be updated to optionally start these three services up. 23 | 24 | ## Implementation 25 | 26 | OpenTelemetry supports three types of instrumentations: metrics, traces and logs. The immediate goal is to get metrics from the node. Traces and logs will be left for a future iteration. 27 | 28 | The OpenTelemetry framework is made up of two components: the API and the SDK. Library code is meant to import and use the API to emit traces, metrics and logs while applications are meant to initialize the SDK. The SDK acts as a "backend" for the API, without it every trace, metric and log become no-ops. This would allow to conditionally initialize the SDK depending on whether stats have been enabled by the user. 29 | 30 | > [!NOTE] 31 | > The `@opentelemetry/api` and `@opentelemetry/sdk-node` packages use global scope for the backend. This means we can't have two services initialize two SDKs in the same NodeJS process as they'd clash with each other (e.g. running both the node and the pxe in the same process and both initialize an SDK instance.) 
32 | 33 | ### Update 34 | 35 | The OpenTelemetry package was wrapped in a custom `@aztec/telemetry` sub-package in the workspace in order to provide custom attributes, metric names and utility functions. See [PR](https://github.com/AztecProtocol/aztec-packages/pull/7102) 36 | 37 | ### Naming 38 | 39 | The OpenTelemetry specification already includes guidelines on telemetry naming. We'll follow the established guidelines: 40 | 41 | - we will prefix all metrics with `aztec` 42 | - we will use full stops `.` to separate components of a name (e.g. `aztec.circuit.simulation`) 43 | - we will use base units for values (e.g. `bytes` over `kilobytes` and `seconds` over `milliseconds`) 44 | - we will _not_ include the unit in the name (there's a separate attribute for that, see code examples) 45 | - custom attributes should only be created if a suitable semantic attribute does not already exist 46 | - attribute and metric names exist within the same hierarchy and must be unique 47 | - meter names must match the class that's being instrumented. In some cases it is acceptable to name a meter with the package name (if the package is instrumented as a whole) 48 | 49 | [Metrics guidelines & naming conventions](https://opentelemetry.io/docs/specs/semconv/general/metrics/) 50 | 51 | > [!NOTE] 52 | > Prometheus does not accept full-stops `.` in metric names and will replace them with underscores `_` currently. This will change once it fully supports the OpenTelemetry specification.
See [prometheus/prometheus#13095](https://github.com/prometheus/prometheus/issues/13095) 53 | 54 | ### Benchmark metrics 55 | 56 | > [!NOTE] 57 | > The implementation has deviated from this code sample, see [below](#implementation-deviations) 58 | 59 | All call sites that emit benchmark stats would be extended to also emit an identical metric via OTel: 60 | 61 | ```ts 62 | // yarn-project/simulator/src/public/public_kernel.ts 63 | import { metrics } from '@opentelemetry/api'; 64 | export class RealPublicKernelCircuitSimulator implements PublicKernelCircuitSimulator { 65 | // ... 66 | private meter = metrics.getMeter('RealPublicKernelCircuitSimulator'); 67 | private circuitSimulationMetric = this.meter.createHistogram('aztec.circuit.simulation.duration', { 68 | unit: 's', 69 | }); 70 | 71 | public async publicKernelCircuitSetup(input: PublicKernelCircuitPrivateInputs): Promise { 72 | // ... 73 | 74 | this.circuitSimulationMetric.record(duration / 1000, { 75 | "aztec.circuit.name": 'public-kernel-setup', 76 | "aztec.circuit.input.size": input.toBuffer().length, 77 | "aztec.circuit.output.size": result.toBuffer().length, 78 | }); 79 | 80 | this.log.debug(`Simulated public kernel setup circuit`, { 81 | // ... 82 | } satisfies CircuitSimulationStats); 83 | } 84 | } 85 | ``` 86 | 87 | > [!NOTE] 88 | > Grafana dashboards will use attribute names to plot individual circuits as needed. 89 | > The input and output size attributes are better attached to a span instead of a histogram record. 
90 | 91 | [Metrics API](https://opentelemetry.io/docs/specs/otel/metrics/api/) 92 | 93 | ### SDK initialization 94 | 95 | > [!NOTE] 96 | > The implementation has deviated from this code sample, see [below](#implementation-deviations) 97 | 98 | ```ts 99 | // yarn-project/aztec-node/src/aztec-node/server.ts 100 | export class AztecNodeService implements AztecNode { 101 | private sdk: NodeSDK; 102 | 103 | public static async createAndSync(config: AztecNodeConfig) { 104 | const sdk = new NodeSDK({ 105 | traceExporter: new ConsoleSpanExporter(), 106 | metricReader: new PeriodicExportingMetricReader({ 107 | exporter: new ConsoleMetricExporter(), 108 | }), 109 | instrumentations: [getNodeAutoInstrumentations()], 110 | }); 111 | 112 | sdk.start(); 113 | // ... 114 | } 115 | 116 | public stop() { 117 | await this.sdk.stop(); 118 | // ... 119 | } 120 | } 121 | ``` 122 | 123 | The `getNodeAutoInstrumentations` function from [@opentelemetry/auto-instrumentations-node](https://www.npmjs.com/package/@opentelemetry/auto-instrumentations-node) sets up instrumentation for a [large number of external packages](https://github.com/open-telemetry/opentelemetry-js-contrib/tree/main/metapackages/auto-instrumentations-node#supported-instrumentations), the vast majority of which are not being used by Aztec. It will be replaced with smaller set of instrumentations that's relevant to Aztec 124 | 125 | > [!NOTE] 126 | > Starting the SDK will set up the global instances accessible through the `@opentelemetry/api` package so that even external packages can use the backend to expose metrics. 127 | 128 | ### System metrics 129 | 130 | The SDK will be initialized with the [`@opentelemetry/host-metrics`](https://www.npmjs.com/package/@opentelemetry/host-metrics) package to track system resources. 
131 | 132 | ### Exporting data 133 | 134 | The main difference between OpenTelemetry and using Prometheus directly is that OpenTelemetry uses a push model (where services push metrics directly to a service/collector), whereas Prometheus uses a pull model (where the prometheus instance scrapes each service individually). Integrating OpenTelemetry would require us to run one extra service in the stack, the OpenTelemetry Collector, that will collect and batch metrics from nodes. 135 | 136 | The Sandbox docker compose file will be updated to include an optional OTel collector, Prometheus instance and Grafana dashboard. They will be turned on using compose profiles: 137 | 138 | ```sh 139 | docker compose --profile metrics up 140 | ``` 141 | 142 | The Terraform IaC will also be updated to deploy an OTel collector, a Prometheus instance and a Grafana dashboard. 143 | 144 | The Grafana dashboard will contain the benchmark stats and system resource utilization for a node. The dashboard will be stored in the repo as code so it's easy to maintain and extend (instead of manually modifying the dashboard in the GUI). 145 | 146 | ### Existing benchmark tests 147 | 148 | The existing benchmark tests will remain unmodified for now. Ideally we'd find a way to capture the metrics data during test runs and recreate the Markdown comment without the need for logging benchmark metrics to stdout. Given the pluggable architecture of OpenTelemetry we should be able to initialize the SDK with a custom `MetricsExporter` that sends the data directly to the existing ndjson files. This would enable us to remove the dependency on `winston`. 149 | 150 | ## Change Set 151 | 152 | Fill in bullets for each area that will be affected by this change.
153 | 154 | - [ ] L1 Contracts 155 | - [ ] Enshrined L2 Contracts 156 | - [ ] Private Kernel Circuits 157 | - [ ] Public Kernel Circuits 158 | - [ ] Rollup Circuits 159 | - [ ] Aztec.nr 160 | - [ ] Noir 161 | - [ ] AVM 162 | - [x] Sequencer 163 | - [ ] Fees 164 | - [ ] P2P Network 165 | - [ ] Cryptography 166 | - [x] DevOps 167 | 168 | ## Test Plan 169 | 170 | 1. All existing tests should continue to work as before when telemetry is off 171 | 2. Optionally write tests that validate that metrics are captured correctly using [custom test exporters](https://opentelemetry.io/docs/concepts/instrumentation/libraries/#testing). 172 | 173 | ## Documentation Plan 174 | 175 | This document will be updated with any deviations from the spec and naming conventions established during implementation. 176 | 177 | ## Rejection Reason 178 | 179 | N/A 180 | 181 | ## Abandonment Reason 182 | 183 | N/A 184 | 185 | ## Implementation Deviations 186 | 187 | ### Wrapped package 188 | 189 | The `@aztec/telemetry` package wraps the OpenTelemetry API providing type safe metric and attribute names and helper functions: 190 | 191 | ```ts 192 | // yarn-project/simulator/src/public/public_kernel.ts 193 | import { type TelemetryClient, type Histogram, Metrics, Attributes } from '@aztec/telemetry'; 194 | 195 | export class RealPublicKernelCircuitSimulator implements PublicKernelCircuitSimulator { 196 | 197 | private circuitSimulationMetric: Histogram; 198 | 199 | constructor(private simulator: SimulationProvider, telemetry: TelemetryClient) { 200 | this.circuitSimulationMetric = telemetry.getMeter('RealPublicKernelCircuitSimulator').createHistogram(Metrics.CIRCUIT_SIMULATION_DURATION, { 201 | unit: 's', 202 | }); 203 | } 204 | 205 | public async publicKernelCircuitSetup(input: PublicKernelCircuitPrivateInputs): Promise { 206 | // ... 
207 | 208 | this.circuitSimulationMetric.record(durationMS / 1000, { 209 | [Attributes.CIRCUIT_PROTOCOL_NAME]: 'public-kernel-setup', 210 | }); 211 | 212 | this.log.debug(`Simulated public kernel setup circuit`, { 213 | // ... 214 | } satisfies CircuitSimulationStats); 215 | } 216 | } 217 | ``` 218 | 219 | ### Separate "instrumentation" class 220 | 221 | In some circumstances it makes sense to extract the instrumentation code to a separate class (e.g. when there are multiple possible implementations for an interface): 222 | 223 | ```ts 224 | export class ProverInstrumentation { 225 | private simulationDuration: Histogram; 226 | private witGenDuration: Gauge; 227 | private provingDuration: Gauge; 228 | // etc 229 | 230 | constructor(telemetry: TelemetryClient, name: string = 'bb-prover') { 231 | const meter = telemetry.getMeter(name); 232 | // create instruments using meter 233 | } 234 | 235 | // type-safe histogram update 236 | public recordDuration( 237 | metric: 'simulationDuration' | 'witGenDuration' | 'provingDuration', 238 | circuitName: CircuitName, 239 | timerOrS: Timer | number, 240 | ) { 241 | const s = typeof timerOrS === 'number' ? timerOrS : timerOrS.s(); 242 | this[metric].record(s, { 243 | [Attributes.PROTOCOL_CIRCUIT_NAME]: circuitName, 244 | [Attributes.PROTOCOL_CIRCUIT_TYPE]: 'server', 245 | }); 246 | } 247 | } 248 | 249 | export class BBNativeRollupProver implements ServerCircuitProver { 250 | private instrumentation: ProverInstrumentation; 251 | constructor(private config: BBProverConfig, telemetry: TelemetryClient) { 252 | this.instrumentation = new ProverInstrumentation(telemetry, "BBNativeRollupProver"); 253 | } 254 | 255 | private async createRecursiveProof() { 256 | // ... 
257 | this.instrumentation.recordDuration('provingDuration', circuitName, provingResult.durationMs / 1000); 258 | } 259 | } 260 | 261 | export class TestCircuitProver implements ServerCircuitProver { 262 | private instrumentation: ProverInstrumentation; 263 | 264 | constructor(telemetry: TelemetryClient) { 265 | this.instrumentation = new ProverInstrumentation(telemetry, "TestCircuitProver"); 266 | } 267 | 268 | /** 269 | * Simulates the base rollup circuit from its inputs. 270 | * @param input - Inputs to the circuit. 271 | * @returns The public inputs as outputs of the simulation. 272 | */ 273 | public async getBaseRollupProof(input: BaseRollupInputs): Promise> { 274 | // ... 275 | this.instrumentation.recordDuration('simulationDuration', 'base-rollup', timer); 276 | } 277 | } 278 | ``` 279 | -------------------------------------------------------------------------------- /in-progress/7346-batch-proving-circuits-and-l1.md: -------------------------------------------------------------------------------- 1 | | | | 2 | | -------------------- | ------------------------------------------------------------------------------------------------ | 3 | | Issue | [Batch proving in Circuits and L1](https://github.com/AztecProtocol/aztec3-packages/issues/7346) | 4 | | Owners | @spalladino | 5 | | Approvers | @LeilaWang @LHerskind @iAmMichaelConnor | 6 | | Target Approval Date | | 7 | 8 | ## Summary 9 | 10 | With the separation of sequencers and provers, we now want to submit root rollup proofs that encompass multiple blocks. This requires changes in the rollup circuit topology as well as the L1 Rollup contract. 11 | 12 | ## Circuits 13 | 14 | Let's first review the responsibilities of the current merge and root rollup circuits. 
15 | 16 | ### Merge Rollup 17 | 18 | The merge rollup circuit has the following inputs: 19 | 20 | ```rust 21 | struct MergeRollupInputs { 22 | previous_rollup_data: [{ 23 | public_inputs: BaseOrMergeRollupPublicInputs 24 | nested_proof: NestedProof, 25 | }; 2] 26 | } 27 | ``` 28 | 29 | The circuit then performs the following checks and computations: 30 | 31 | - Recursively verifies the left and right inputs 32 | - Checks that the tree is greedily filled from left to right 33 | - Checks that constants from left and right match 34 | - Checks that the end state from left matches the start state from right (ie they follow from each other) 35 | - Outputs the start of left and end of right 36 | - Hashes together or sums up any accumulated fields (tx count, effects hashes, accumulated fees, etc) 37 | - Propagates constants 38 | 39 | And outputs: 40 | 41 | ```rust 42 | struct BaseOrMergeRollupPublicInputs { 43 | constants: { 44 | previous_archive: TreeSnapshot, 45 | global_variables: { block_number, timestamp, ... } 46 | }, 47 | start: PartialStateReference, 48 | end: PartialStateReference, 49 | txs_effects_hash: Field, 50 | out_hash: Field, 51 | accumulated_fees: Field 52 | num_txs: Field, 53 | } 54 | ``` 55 | 56 | ### Root rollup 57 | 58 | The root rollup takes the same inputs as merge rollup, plus fields related to L1-to-L2 messaging, and related to updating the archive tree with the new block root. 
59 | 60 | ```rust 61 | struct RootRollupInputs { 62 | previous_rollup_data : [PreviousRollupData; 2], 63 | 64 | l1_to_l2_roots: RootRollupParityInput, 65 | l1_to_l2_messages : [Field; NUMBER_OF_L1_L2_MESSAGES_PER_ROLLUP], 66 | l1_to_l2_message_subtree_sibling_path : [Field; L1_TO_L2_MSG_SUBTREE_SIBLING_PATH_LENGTH], 67 | start_l1_to_l2_message_tree_snapshot : AppendOnlyTreeSnapshot, 68 | 69 | start_archive_snapshot : AppendOnlyTreeSnapshot, 70 | new_archive_sibling_path : [Field; ARCHIVE_HEIGHT], 71 | } 72 | ``` 73 | 74 | It performs the same checks as the merge circuit, plus: 75 | 76 | - Creates a new L1-to-L2 tree snapshot 77 | - Creates the new block header, which includes the previous archive tree root 78 | - Updates the archive tree with the new block hash 79 | 80 | ```rust 81 | struct RootRollupPublicInputs { 82 | archive: AppendOnlyTreeSnapshot, 83 | header: { 84 | previous_archive: AppendOnlyTreeSnapshot, 85 | content_commitment: ContentCommitment, 86 | state: StateReference, 87 | global_variables: GlobalVariables, 88 | total_fees: Field 89 | } 90 | } 91 | ``` 92 | 93 | ### New rollup structure 94 | 95 | We propose changing the current rollup structure introducing two new circuits, a **block root rollup** and a **block merge rollup** circuit. The block root rollup circuit acts exactly the same as today's root rollup, grouping multiple base rollup into a tree via merge rollups until it produces a block. The block merge rollup circuits would then merge multiple blocks into a tree, until it reaches a new root rollup that proves a block range. 
96 | 97 | The tree levels, from top to bottom, would then be: 98 | 99 | - Root 100 | - Block merge 101 | - Block root 102 | - Merge 103 | - Base 104 | 105 | ### Block root rollup 106 | 107 | The block root rollup circuit is the same as today's root rollup circuit, but with its public inputs tweaked so it matches the public inputs from the block merge rollup as well, in the same way as today the base rollup public inputs are tweaked so they match the ones from the merge rollup. 108 | 109 | ```rust 110 | struct BlockRootOrBlockMergePublicInputs { 111 | previous_archive: AppendOnlyTreeSnapshot, // Archive tree root immediately before this block 112 | new_archive: AppendOnlyTreeSnapshot, // Archive tree root after adding this block 113 | previous_block_hash: Field, // Identifier of the previous block 114 | end_block_hash: Field, // Identifier of the current block 115 | out_hash: Field, // Merkle root of the L2-to-L1 messages in the block 116 | start_global_variables: GlobalVariables, // Global variables for this block 117 | end_global_variables: GlobalVariables, // Global variables for this block 118 | fees: [{ recipient: Address, value: Field }; 32], // Single element equal to global_variables.coinbase and total_fees for the block 119 | } 120 | ``` 121 | 122 | ### Block merge rollup 123 | 124 | The block merge rollup circuit, following the same line of the merge circuit, would take two `BlockRootOrBlockMergePublicInputs` and merge them together: 125 | 126 | ```rust 127 | struct BlockMergeInputs { 128 | previous_rollup_data: [{ 129 | public_inputs: BlockRootOrBlockMergePublicInputs 130 | nested_proof: NestedProof, 131 | }; 2] 132 | } 133 | ``` 134 | 135 | Note that the semantics of the `BlockRootOrBlockMergePublicInputs` are now generalized to a block range: 136 | 137 | ```rust 138 | struct BlockRootOrBlockMergePublicInputs { 139 | previous_archive: AppendOnlyTreeSnapshot, // Archive tree root immediately before this block range 140 | new_archive: 
AppendOnlyTreeSnapshot, // Archive tree root after adding this block range 141 | out_hash: Field, // Merkle node of the L2-to-L1 messages merkle roots in the block range 142 | previous_block_hash: Field, // Identifier of the previous block before the range 143 | end_block_hash: Field, // Identifier of the last block in the range 144 | start_global_variables: GlobalVariables, // Global variables for the first block in the range 145 | end_global_variables: GlobalVariables, // Global variables for the last block in the range 146 | fees: [{ recipient: Address, value: Field }; 32] // Concatenation of all coinbase and fees for the block range 147 | } 148 | ``` 149 | 150 | This circuit then performs the following checks and computations: 151 | 152 | - Recursively verifies the left and right inputs 153 | - Checks that `right.previous_archive` equals `left.new_archive` 154 | - Checks that `right.previous_block_hash` equals `left.end_block_hash` 155 | - Checks that `right.start_global_variables` follow from `left.end_global_variables` 156 | - Concatenates and outputs the `fees` from both inputs 157 | - Outputs `sha256(left.out_hash, right.out_hash)` as its own `out_hash` 158 | - Outputs `previous_archive`, `start_global_variables`, and `previous_block_hash` from `left` 159 | - Outputs `new_archive`, `end_global_variables`, and `end_block_hash` from `right` 160 | 161 | Note that we say that the global variables in `right` "follow" the ones in `left` if: 162 | 163 | - `left.chain_id == right.chain_id` 164 | - `left.version == right.version` 165 | - `left.block_number + 1 == right.block_number` 166 | - `left.timestamp < right.timestamp` 167 | - `coinbase`, `fee_recipient`, and `gas_fees` are not constrained (though `gas_fees` may be in a 1559-like world) 168 | 169 | ### Root rollup 170 | 171 | The new root rollup circuit then takes two `BlockRootOrBlockMergePublicInputs`, performs the same checks as the block merge rollup, but outputs a subset of the public inputs, to make L1 
verification cheaper: 172 | 173 | ```rust 174 | struct RootRollupPublicInputs { 175 | previous_archive: Field, 176 | end_archive: Field, 177 | end_block_hash: Field, 178 | end_timestamp: Field, 179 | end_block_number: Field, 180 | out_hash: Field, 181 | fees: [{ recipient: Address, value: Field }; 32] 182 | } 183 | ``` 184 | 185 | ### Empty block root rollup 186 | 187 | Since we no longer submit a proof per block, and thanks to @MirandaWood we now have wonky rollups, we no longer need to fill a block with "empty" txs. This means we can discard the empty private kernel circuit. 188 | 189 | However, we still need to be able to represent an empty block. We can do this by introducing an empty block root rollup circuit, which outputs a `BlockRootOrBlockMergePublicInputs` which does not consume any merge rollups, and the end state just equals the start state. Note that we may still need an empty nested circuit to fill in the nested proof. 190 | 191 | ## L1 Rollup Contract 192 | 193 | The Rollup contract today has a main entrypoint `process(header, archive, aggregationObject, proof)`. We propose breaking this method into two: 194 | 195 | ```solidity 196 | contract Rollup { 197 | process(header, archive); 198 | submitProof(publicInputs, aggregationObject, proof); 199 | } 200 | ``` 201 | 202 | ### State 203 | 204 | To track both the proven and unproven chains, we add the following state variables to the contract: 205 | 206 | ```diff 207 | contract Rollup { 208 | bytes32 lastArchiveTreeRoot; 209 | uint256 lastBlockTimestamp; 210 | + bytes32 verifiedArchiveTreeRoot; 211 | + uint256 verifiedBlockTimestamp; 212 | } 213 | ``` 214 | 215 | The `last*` fields are updated every time a new block is uploaded, while the `verified*` ones are updated when a proof is uploaded. In the event of a rollback due to failure on proof submission, the `last*` fields are overwritten with the contents of `verified*`. 
216 | 217 | ### Process 218 | 219 | Today the `process` method does the following: 220 | 221 | 1. Validate the new block header 222 | 2. Update the `lastArchiveTreeRoot` and `lastBlockTimestamp` in the contract storage 223 | 3. Consume L1-to-L2 messages from the Inbox 224 | 4. Emit an `L2BlockProcessed` event with the block number 225 | 5. Test data availability against the availability oracle 226 | 6. Verify the root rollup proof 227 | 7. Insert L2-to-L1 messages into the Outbox using the `out_hash` and block number 228 | 8. Pay out `total_fees` to `coinbase` in the L1 gas asset 229 | 230 | The first four items can keep being carried out by the `process` method for each new block that gets submitted. Proof verification, L2-to-L1 messages, and fee payment messages are moved to `submitProof`. Data availability needs more clarity based on the interaction between the blob circuits and the point evaluation precompile, but it may be moved entirely to `submitProof`. 231 | 232 | As for consuming L1-to-L2 messages, note that these cannot be fully deleted from the Inbox, since we need to be able to rollback unproven blocks in the event of a missing proof, which requires being able to consume those messages again on the new chain. 233 | 234 | Last, we should rename `process` to something more descriptive, such as `submitBlock`. 235 | 236 | ### Submit proof 237 | 238 | This method receives the root rollup public inputs, plus the aggregation object and proof to verify. Its responsibilities are the ones removed from `process`: 239 | 240 | 1. Verify the root rollup proof 241 | 2. Insert L2-to-L1 messages into the Outbox using the `public_inputs.out_hash` and the _last block number in the range_ 242 | 3. Pay out `value` to `recipient` in the L1 gas asset for each pair in the `public_inputs.fees` array 243 | 244 | In addition, this method: 245 | 246 | 1.
Checks that the `public_inputs.previous_archive` matches the current `verifiedArchiveTreeRoot` in the L1 contract (ie that the block range proven follows immediately from the previous proven block range). 247 | 2. Proves that the `end_block_hash` is in the `lastArchiveTreeRoot` (ie that the last block in the range proven is actually part of the pending chain) via a Merkle membership proof. 248 | 3. Updates the `verifiedArchiveTreeRoot` and `verifiedBlockTimestamp` fields in the L1 contract. 249 | 4. Emits an `L2ProofVerified` event with the block number. 250 | 251 | ## Discussions 252 | 253 | ### Binary vs fixed-size block merge rollup circuit 254 | 255 | Assuming we implement a proving coordination mechanism where the block ranges proven are of fixed size (eg 32), then we could have a single circuit that directly consumes the 32 block root rollup proofs and outputs the public inputs expected by L1. This could be more efficient than constructing a tree of block root rollup proofs. On the other hand, it is not parallelizable, inflexible if we wanted a dynamic block range size, and must wait until all block root rollup proofs are available to start proving. 256 | 257 | ### Batched vs optimistic vs pull fee payments 258 | 259 | In the model above, fee payments are executed in-batch at the time the proof is submitted. Assuming the L1 payment asset includes a `transferBatch` method, the cost of 32 payments can be brought down to about 160k gas (5000 x 32), which is manageable. However, it imposes a max size on the number of blocks that can be proven in a single batch, and 160k is still a significant number. 260 | 261 | Alternatively, we could keep fee payments as part of the `Rollup.process` method, so that a sequencer is paid the moment they submit a block, even if it doesn't get proven. Depending on how we implement unhappy paths in block building, we could optimistically pay fees on submission, and recoup them as part of a slashing mechanism.
This would also simplify the block rollup circuits, as we would not need to construct an array of payments. Either way, it could be a good idea to start with this approach initially as it requires the least engineering effort. 262 | 263 | Another option is to implement pull payments via merkle proofs, where each proof submits not the explicit list of payments but a merkle root of them, which gets stored in the contract. Sequencers then need to submit a merkle membership proof to claim their corresponding fees. This gives us the flexibility of being able to cram as many blocks in a proof as we want to, but adds significant complexity and gas cost to sequencers. 264 | -------------------------------------------------------------------------------- /in-progress/7520-testnet-overview.md: -------------------------------------------------------------------------------- 1 | # [TestNet Design Overview](https://github.com/AztecProtocol/aztec-packages/issues/7520) 2 | 3 | | | | 4 | | -------------------- | -------------------------------- | 5 | | Owners | @just-mitch @LHerskind @Maddiaa0 | 6 | | Approvers | @charlielye @joeandrews | 7 | | Target Approval Date | 2024-08-16 | 8 | 9 | This document is a system design overview of what is to be delivered as the TestNet, focusing on networks, L1 interactions, governance, and economics. 10 | 11 | A fully functional test network is expected to be delivered by Dec 2, 2024. This network will be publicly available, but with no guarantees of security or stability. 12 | 13 | The deployed network will be referred to as "TestNet".
14 | 15 | Thus, in the immediate term, Aztec Labs and/or the Aztec Foundation (once set up) will be coordinating the operation of the following networks: 16 | 17 | - AlphaNet: a private network for testing and development 18 | - DevNet: a public network for app developers with a centralized sequencer and prover 19 | - Spartan: a public network for infrastructure providers with permissioned sequencers and provers 20 | 21 | By December 16, 2024, these will be consolidated into: 22 | 23 | - AlphaNet: a private network for testing and development 24 | - TestNet: a public network with permissionless sequencers and provers 25 | 26 | The objective of this document is to outline engineering's current understanding of what will be built. 27 | 28 | **Note:** Most of the components below will have their own design documents. 29 | 30 | ## Overview 31 | 32 | The Aztec Network is a privacy-focused, general-purpose Layer 2 network built on Ethereum. It will use zero-knowledge client-side proofs to enable private, programmable transactions, a VM to enable verified public computation, and a rollup architecture to scale. Aztec will be designed to be permissionless and decentralized, while maintaining sound economics and governance, as well as Turing-complete functionality for application developers to build in any applicable compliance requirements. 33 | 34 | ## L1 35 | 36 | L1 is Ethereum Sepolia. 37 | 38 | ## Network L1 Deployments 39 | 40 | A deployment of the Aztec Network includes several contracts running on L1. 41 | 42 | ### TST Contract 43 | 44 | TST will be an ERC20 asset that will be used to pay for transaction fees on the Aztec Network. 45 | 46 | It will also be used on L1 as part of the validator selection process. 47 | 48 | Protocol incentives are paid out in TST. 49 | 50 | A canonical bridge will allow bridging TST from L1 to L2. 51 | 52 | TST bridged through the canonical bridge is exclusively used to pay transaction fees; it cannot be transferred to other users on L2. 
53 | 54 | The TST contract will be immutable. 55 | 56 | ### Incentives Contract 57 | 58 | The Incentives contract will be responsible for minting TST. 59 | 60 | Only the owner of the Incentives contract can mint TST. 61 | 62 | It has a rate limiter on minting. 63 | 64 | The Incentives contract will be immutable. 65 | 66 | ### Governance Contract 67 | 68 | The Governance Contract owns the Incentives contract. 69 | 70 | TST holders can lock their TST in the Governance Contract to vote on proposals. 71 | 72 | Proposals can only be submitted by the PendingProposals Contract. 73 | 74 | Proposals must garner X% of the total locked TST to be ratified. 75 | 76 | There will be a time delay between ratification and execution. 77 | 78 | ### Registry Contract 79 | 80 | The Registry Contract will be able to keep track of the current/canonical and historical Instances. 81 | 82 | An Instance will be comprised of: 83 | - A Rollup Contract, which is the main contract that handles the rollup of transactions. 84 | - A Data Availability Oracle, which is responsible for answering whether the preimages of commitments have been made available. 85 | - An Inbox, responsible for receiving messages from L1 and making them available on L2. 86 | - An Outbox, responsible for receiving messages from L2 and making them available on L1. 87 | 88 | ### Rollup Contract 89 | 90 | The initial Rollup Contract will require holders of TST to stake to become validators. 91 | 92 | The initial Rollup Contract will maintain a balance of TST to be used for rewards. 93 | 94 | ### PendingProposals Contract 95 | 96 | The PendingProposals Contract keeps track of governance proposals and votes. 97 | 98 | It watches for proposal signals in the Registry's canonical Instance. 99 | 100 | When M of the previous N blocks contain the same proposal, it is submitted to Governance. 101 | 102 | 103 | ### Open Questions 104 | 105 | - How does it work? 106 | - Can this mechanism be tied to the based sequencing mechanism? 
107 | 108 | ## Aztec Node 109 | 110 | Aztec Labs will provide a reference implementation of the Aztec Node, which will be used to run the Aztec Network. 111 | 112 | It will have 3 primary modes of operation: 113 | 114 | - Proposer/Validator: responsible for proposing new blocks and validating them 115 | - Prover: responsible for orchestrating the creation of various proofs 116 | - Full Node: follows along, responsible for propagating transactions and blocks 117 | 118 | The Aztec Node will have a web interface for monitoring key metrics. 119 | 120 | ## Chains, slots, and epochs 121 | 122 | There will be three chains in the Aztec Network: 123 | 124 | - The Pending Chain 125 | - The Proven Chain 126 | - The Finalized Chain 127 | 128 | All three chains will be effectively managed by the Aztec Node and the L1 contracts, and will have different guarantees. 129 | 130 | Time will be divided into slots, which will be grouped into epochs. 131 | 132 | Each slot will have a proposer, who is responsible for proposing a block of transactions. 133 | 134 | Each epoch will have a set of validators, who add economic security to the Pending Chain by providing signatures on proposed blocks; ultimately the Pending Chain is a UX feature that allows users to see their transactions with _some guarantee_ before they are proven. 135 | 136 | ### Open Questions 137 | 138 | - How long should a slot be? 139 | - How long should an epoch be? 140 | 141 | ## The Pending Chain 142 | 143 | The purpose of the pending chain is to reduce the perceived latency of transactions: it allows clients to observe transactions that have been proposed, but the proof has not yet been made available. 144 | 145 | The proposer for a slot produces a list of transaction objects either by 146 | 147 | - selecting them from the L2 mempool, 148 | - or receiving a list from a builder. 
149 | 150 | The proposer gossips to the validators: 151 | 152 | - A signature showing it is the current proposer 153 | - The list of transaction objects 154 | 155 | Validators: 156 | - check that the proposer is the current proposer 157 | - verify the private kernel proofs of the transaction objects 158 | - create a signature over the list of transaction objects. 159 | 160 | Once the proposer has collected enough signatures, it submits the signatures data to a function on the rollup contract dedicated to advancing the pending chain. 161 | 162 | The next proposer will watch L1 for updates to the pending chain, execute the constituent transactions (possibly getting them from a peer) and produce the implied L2 header of the **previous/published** block *before* it then selects the TxObjects that will form its block. 163 | 164 | 165 | ### Open Questions 166 | 167 | - Do we need to do optimistic signature verification? 168 | - How many signatures are required? 169 | - How will the proposer of the invalid transaction be charged? 170 | 171 | ## The Proven Chain 172 | 173 | The purpose of the proven chain is to verify a zero-knowledge proof that attests to the correctness of the transactions in the pending chain. 174 | 175 | It is a prefix of the pending chain. 176 | 177 | The proposer named in the first slot in an epoch has monopoly rights on proving the previous epoch. 178 | 179 | The proof of epoch `i` must be submitted within a certain number of L1 blocks after the end of epoch `i`. 180 | 181 | If this does not happen, there is an "open challenge period" where anyone can submit a proof of the epoch. 182 | 183 | If no proof is submitted the epoch is considered invalid; the pending chain is rolled back to the last proven epoch. 184 | 185 | The proposers must coordinate payment and proving out of protocol. 186 | 187 | Some users may coordinate with prover marketplaces, but the Aztec Node will come with the ability to "self-prove" an epoch. 
188 | 189 | ### Open Questions 190 | 191 | - Is a prover commitment bond needed in-protocol? 192 | - How do proving marketplaces integrate? 193 | - What is the timeliness requirement for the proof submission? 194 | - What is the open challenge period? 195 | - Under what conditions can the pending chain be rolled back? 196 | - Is "steal your funds" ever possible? 197 | 198 | ## Based Sequencing 199 | 200 | As a safety mechanism, all deployed instances will support a "based" sequencing mode that allows blocks to be added to the pending/proven chain without the need for L2 validators. 201 | 202 | ### Open Questions 203 | 204 | - How does it work? 205 | - What are the circumstances for using based sequencing? 206 | 207 | ## The Finalized Chain 208 | 209 | The purpose of the finalized chain is to provide a final, immutable (up to Casper FFG) record of the state of the Aztec Network. 210 | 211 | It is a prefix of the proven chain, and blocks naturally move from the proven chain to the finalized chain as proofs become finalized in the eyes of L1. 212 | 213 | ## Prover Nodes 214 | 215 | Prover nodes will receive information from proposers and will be responsible for creating proofs, and posting them to L1. 216 | 217 | ### Open Questions 218 | 219 | - What is the interface that proposers will use to communicate with prover nodes? 220 | - Do they need one other than L1? 221 | 222 | ## Proposer/Validator Selection 223 | 224 | As noted above, the initial Rollup contract will allow holders of TST to stake a set amount to become validators. 225 | 226 | One user can have multiple validators. 227 | 228 | A randao will be used to select a committee from the validator set for each epoch. 229 | 230 | Each slot in an epoch will be randomly assigned to a validator in the committee. 231 | 232 | ### Open Questions 233 | 234 | - How should rewards be distributed? 235 | - What are the slashing conditions? 
236 | - Probability/severity/cost of different attacks based on the power of attacker (1%, 5%, 10%, 20%, 33%, 50%, 67% of stake) 237 | - Time to detect and react to a safety breach? 238 | - What is the marginal cost/benefit of an extra validator in the set/committee? 239 | 240 | ## Fees 241 | 242 | Every transaction in the Aztec Network has a fee associated with it. The fee is paid in TST which has been bridged to L2. 243 | 244 | Transactions consume gas. There are two types of gas: 245 | 246 | - L2 gas: the cost of computation 247 | - DA gas: the cost of publishing/storing data 248 | 249 | When a user specifies a transaction, they provide values: 250 | 251 | - maxFeePerL2Gas: the maximum fee they are willing to pay in TST per unit L2 gas 252 | - maxFeePerDAGas: the maximum fee they are willing to pay in TST per unit DA gas 253 | - l2GasLimit: the maximum amount of L2 gas they are willing to consume 254 | - daGasLimit: the maximum amount of DA gas they are willing to consume 255 | 256 | Thus, the maximum fee they are willing to pay is: 257 | 258 | - maxFee = maxFeePerL2Gas * l2GasLimit + maxFeePerDAGas * daGasLimit 259 | 260 | There is an additional pair of parameters to support complex flows such as fee abstraction: 261 | 262 | - l2TeardownGasLimit: the maximum amount of L2 gas they are willing to consume for the teardown of the transaction 263 | - daTeardownGasLimit: the maximum amount of DA gas they are willing to consume for the teardown of the transaction 264 | 265 | Both of these values are used to "pre-pay" for the public teardown phase of the transaction. 266 | 267 | Each L2 block has a fixed L2 gas limit and a DA gas limit. 268 | 269 | There will be an in-protocol mechanism for updating the `feePerL2Gas` and `feePerDAGas` values. 270 | 271 | ### Open Questions 272 | 273 | - How will the user figure out fitting values for `maxFeePerL2Gas` and `maxFeePerDAGas`? 274 | - How can it be ensured that the cost of proving is covered by L2 gas? 
275 | - How will L1 figure out fitting values for `feePerL2Gas` and `feePerDAGas`, such that costs are correctly passed back to the users? 276 | 277 | 278 | ## Proven Block Rewards 279 | 280 | The protocol may have rewards for proven blocks. 281 | These will be in addition to the transaction fees paid by users. 282 | 283 | ### Open Questions 284 | 285 | - How much should proven blocks be subsidized? 286 | - How much should the protocol retain for future development? 287 | - Historical proposers and validators should be rewarded. How can this be done in a way that is not extremely costly? 288 | 289 | ## Transaction Lifecycle 290 | 291 | The executable code of a transaction goes through the following lifecycle: 292 | 293 | 1. Locally, in private: 294 | 1. Setup 295 | 2. App Logic 296 | 2. On L2, in public: 297 | 1. Setup 298 | 2. App Logic 299 | 3. Teardown 300 | 301 | If the private portion fails, the transaction is not submitted to L2. 302 | 303 | If the public portion fails in the setup phase, the transaction is invalid, and discarded. 304 | 305 | If the public portion fails in the app logic or teardown phase the side effects from the failing stage are discarded but the transaction is still valid. Users can simulate their transactions ahead of time and not submit them if they fail. 306 | 307 | ### Open Questions 308 | 309 | - How painful is it for sequencers to whitelist public setup code? 310 | 311 | ## Data Availability 312 | 313 | Ethereum blobs will be used to publish TxObjects and proofs. 314 | 315 | ## Penalties and Slashing 316 | 317 | There will be penalties for proposers and provers who do not fulfill their duties. 318 | 319 | ### Open Questions 320 | 321 | - Under what conditions should actors be slashed? 322 | - committee members 323 | - proposers 324 | - provers 325 | - What is required to convince L1 that the conditions are met? 326 | - What is the "cost" of an enforcement action? e.g., if the penalty is tiny, it might not be worth enforcing. 
327 | - What are the penalties for proposers and provers? 328 | - How can it be ensured that the penalties are fair? 329 | - What should be burned, and what should be distributed? 330 | - Expected annual return for validators (mean, median)? 331 | 332 | ## Disclaimer 333 | 334 | The information set out herein is for discussion purposes only and does not represent any binding indication or commitment by Aztec Labs and its employees to take any action whatsoever, including relating to the structure and/or any potential operation of the Aztec protocol or the protocol roadmap. In particular: (i) nothing in these projects, requests, or comments is intended to create any contractual or other form of legal relationship with Aztec Labs or third parties who engage with this AztecProtocol GitHub account (including, without limitation, by responding to a conversation or submitting comments) (ii) by engaging with any conversation or request, the relevant persons are consenting to Aztec Labs’ use and publication of such engagement and related information on an open-source basis (and agree that Aztec Labs will not treat such engagement and related information as confidential), and (iii) Aztec Labs is not under any duty to consider any or all engagements, and that consideration of such engagements and any decision to award grants or other rewards for any such engagement is entirely at Aztec Labs’ sole discretion. Please do not rely on any information on this account for any purpose - the development, release, and timing of any products, features, or functionality remains subject to change and is currently entirely hypothetical. Nothing on this account should be treated as an offer to sell any security or any other asset by Aztec Labs or its affiliates, and you should not rely on any content or comments for advice of any kind, including legal, investment, financial, tax, or other professional advice. 
335 | -------------------------------------------------------------------------------- /in-progress/7588-spartan-clusters.md: -------------------------------------------------------------------------------- 1 | # Configurable k8s clusters for testing and deploying networks 2 | 3 | | | | 4 | | -------------------- | ----------------------------------------------------------- | 5 | | Issue | https://github.com/AztecProtocol/aztec-packages/issues/7588 | 6 | | Owners | @just-mitch | 7 | | Approvers | @PhilWindle @alexghr @charlielye | 8 | | Target Approval Date | 2024-08-16 | 9 | 10 | 11 | ## Executive Summary 12 | 13 | We will: 14 | - add a helm chart for deploying a configurable network. 15 | - add support for ad-hoc k8s clusters in CI and running e2e tests on them 16 | - add support for deploying a public network 17 | 18 | 19 | ## Introduction 20 | 21 | To properly test a decentralized system, we need the ability to spin up networks with different topologies such as the number of nodes, validators, provers, pxes. 22 | 23 | Additionally, we need to test under simulated stress/attack conditions. 24 | 25 | Further, we need to be able to deploy these networks in a repeatable and automated way. 26 | 27 | > Kubernetes, also known as K8s, is an open source system for automating deployment, scaling, and management of containerized applications. 28 | 29 | And [Helm](https://helm.sh/) is "the package manager for Kubernetes". 30 | 31 | This allows us to define a network configuration in a helm chart and deploy it to a k8s cluster, setting variables on the fly such as the number of nodes, validators, machine resources, etc. 32 | 33 | K8s is also easy to use in CI via [kind](https://kind.sigs.k8s.io/). 34 | 35 | > kind is a tool for running local Kubernetes clusters using Docker container “nodes”. 36 | > kind was primarily designed for testing Kubernetes itself, but may be used for local development or CI. 
37 | 38 | Further, we can use [chaos mesh](https://chaos-mesh.org/) to simulate network conditions such as node failures, latency, etc. 39 | 40 | Last, we can likely use the exact same chart to deploy a public network. 41 | 42 | ### Why not docker compose? 43 | 44 | We use docker-compose already in some tests. The problem is that it is very difficult to test a network with more than a few nodes. It is also difficult to simulate network conditions. Last, we wouldn't be able to use the same tooling to deploy a public network. 45 | 46 | The thinking here is to use the same tooling that we use for production deployments to test our networks. This should result in less code and more confidence. 47 | 48 | ## Interface 49 | 50 | The users of this work are developers of Aztec and possibly infrastructure providers. 51 | 52 | There will be a new top-level folder `helm-charts`. In it, there will be an `aztec-network` chart. 53 | 54 | A lot of the value is shown in the helm chart's values.yaml file. The values in this file are used to populate the templates that make up the rest of the chart. 
Here is an example: 55 | 56 | ```yaml 57 | images: 58 | aztec: 59 | image: aztecprotocol/aztec:c07d910d0b8b83b62008e79a7085db6c5020df4e 60 | 61 | bootNode: 62 | replicas: 1 63 | service: 64 | p2pPort: 40400 65 | nodePort: 8080 66 | logLevel: 'debug' 67 | debug: 'discv5:*,aztec:*' 68 | fakeProofs: false 69 | resources: 70 | requests: 71 | cpu: 100m 72 | memory: 128Gi 73 | limits: 74 | cpu: 100m 75 | memory: 128Gi 76 | 77 | 78 | validator: 79 | replicas: 32 80 | service: 81 | p2pPort: 40400 82 | nodePort: 8080 83 | logLevel: 'debug' 84 | debug: 'discv5:*,aztec:*' 85 | fakeProofs: false 86 | resources: {} 87 | 88 | pxe: 89 | replicas: 3 90 | service: 91 | type: ClusterIP 92 | port: 8080 93 | targetPort: 8080 94 | fakeProofs: false 95 | resources: {} 96 | 97 | chaos: 98 | scenarios: 99 | - name: 'boot-node-failure' 100 | type: 'pod' 101 | selector: 102 | matchLabels: 103 | app: boot-node 104 | action: 'kill' 105 | actionOptions: 106 | force: true 107 | ``` 108 | 109 | The `yarn-projects/+end-to-end-base` earthly image will be updated to include `kubectl`, `helm`, and `kind`. 110 | 111 | We will add new tests to `yarn-projects/end-to-end`, and expose them to CI as earthly targets in the form `yarn-projects/end-to-end/+network-{test_name}`. 112 | 113 | The `ci` github action workflow will have a new job analogous to `e2e` and `bench-e2e` called `network-e2e`. This job will be a matrix on the `+network` targets, and for each it will ensure the test machine has a k8s cluster running with the relevant docker images loaded (via kind), and run a particular earthly target. 114 | 115 | Each target will: 116 | 1. deploy a network to the k8s cluster (via helm install) 117 | 2. (optional) apply some chaos configurations 118 | 3. run helm tests 119 | 4. tear down the network 120 | 121 | 122 | ## Implementation 123 | 124 | ### Helm Chart 125 | 126 | **anvil** 127 | 128 | There will be a deployment for anvil. It will have a single replica, and be exposed via ClusterIP. 
129 | 130 | **boot node** 131 | 132 | There will be a statefulset with a single replica for the boot node. 133 | 134 | From the aztec node perspective, it will be started as `start --node --archiver`. 135 | 136 | As part of its init container it will deploy the enshrined L1/L2 contracts. Other nodes in the network will be able to resolve the boot node's address via its stable DNS name, e.g. `boot-node-0.aztec-network.svc.cluster.local`. 137 | 138 | **full node** 139 | 140 | There will be a statefulset for the full nodes, i.e. `start --node --archiver` 141 | 142 | The number of replicas will be configurable. Each full node will have a service exposing its p2p port and node port. 143 | 144 | As part of their init container, they will get config from the boot node, including its ENR (which will require exposing this on the `get-node-info` endpoint). 145 | 146 | It will be possible to address full nodes individually via their stable DNS name, e.g. `full-node-0.aztec-network.svc.cluster.local`, as well as collectively via a service, e.g. `full-node.aztec-network.svc.cluster.local`. 147 | 148 | **validator** 149 | 150 | Similar configuration as full nodes, but with a different service name, and started via `start --node --archiver --sequencer`. 151 | 152 | The number of replicas will be configurable. 153 | 154 | Tests will add/remove validators to/from the L1 validator set. 155 | 156 | **prover node** 157 | 158 | Same configuration as full nodes, but with a different service name, and started via `start --node --archiver --prover-node`. 159 | 160 | The number of replicas will be configurable. 161 | 162 | **prover agent** 163 | 164 | There will be a deployment for prover agents, and started as `start --prover`. 165 | 166 | The number of replicas will be configurable. 167 | 168 | **pxe** 169 | 170 | PXEs will be deployed as a statefulset. 171 | 172 | It will be started as `start --pxe`. 173 | 174 | The number of replicas will be configurable. 
Each PXE will have a service exposing its port. 175 | 176 | PXEs will use the collective full node service as their node url by default. 177 | 178 | PXEs will only be able to be addressed individually. 179 | 180 | **opentel** 181 | 182 | There will be a deployment for opentel. It will have a single replica. Nodes in the network will push their logs to opentel. 183 | 184 | **prometheus** 185 | 186 | There will be a deployment for prometheus. It will have a single replica. 187 | 188 | **grafana** 189 | 190 | There will be a deployment for grafana. It will have a single replica, and be exposed via ClusterIP. 191 | 192 | 193 | ### Staging Network 194 | 195 | We will create a long-lived k8s cluster deployed on AWS. This cluster will be used for running the `staging` network. 196 | 197 | We will use this network for long-running stress tests. 198 | 199 | There will be a github action workflow that deploys the network to this cluster on every push to the `staging` branch. 200 | 201 | The grafana dashboard will be exposed via a public IP, but password protected. 202 | 203 | ### Production Network 204 | 205 | There will be a separate long-lived k8s cluster deployed on AWS. This cluster will be used for running the public `spartan` network. 206 | 207 | There will be a github action workflow that deploys the network to this cluster on every push to the `spartan` branch. 208 | 209 | When `spartan` is deployed, it will deploy the boot node (pointed at Sepolia) and expose a service bound to a static IP for accessing it. There won't be any full nodes or PXEs deployed by default. The grafana dashboard will be exposed via a public IP, but password protected. 210 | 211 | ### Chaos 212 | 213 | We will add a `chaos` chart within `helm-charts`. 
214 | 215 | This will have templates in it for enabling chaos, e.g.: 216 | 217 | ```yaml 218 | apiVersion: chaos-mesh.org/v1alpha1 219 | kind: PodChaos 220 | metadata: 221 | name: pod-failure-example 222 | namespace: testing-network-namespace 223 | spec: 224 | action: pod-failure 225 | mode: one 226 | duration: '30s' 227 | selector: 228 | labelSelectors: 229 | 'app.kubernetes.io/component': 'boot-node' 230 | ``` 231 | 232 | In reality those selectors and durations will be configurable via the `values.yaml` file. 233 | 234 | ### +network tests 235 | 236 | The network tests will use an analogous helper to the `E2E_TEST` in `yarn-projects/end-to-end/Earthfile`. 237 | 238 | ```dockerfile 239 | NETWORK_TEST: 240 | FUNCTION 241 | ARG hardware_concurrency="" 242 | ARG namespace 243 | ARG test 244 | ARG network_values 245 | ARG chaos_values 246 | LOCALLY 247 | # Let docker compose know about the pushed tags above 248 | ENV AZTEC_DOCKER_TAG=$(git rev-parse HEAD) 249 | # load the docker image into kind 250 | RUN kind load docker-image aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG 251 | RUN helm install aztec-network helm-charts/aztec-network --set $network_values --namespace $namespace 252 | RUN helm install aztec-chaos helm-charts/aztec-chaos --set $chaos_values --namespace $namespace 253 | RUN helm test aztec-network --namespace $namespace 254 | ``` 255 | 256 | 257 | ## Change Set 258 | 259 | Fill in bullets for each area that will be affected by this change. 
260 | 261 | - [ ] Cryptography 262 | - [ ] Noir 263 | - [ ] Aztec.js 264 | - [x] PXE 265 | - [ ] Aztec.nr 266 | - [ ] Enshrined L2 Contracts 267 | - [ ] Private Kernel Circuits 268 | - [x] Sequencer 269 | - [ ] AVM 270 | - [ ] Public Kernel Circuits 271 | - [ ] Rollup Circuits 272 | - [ ] L1 Contracts 273 | - [ ] Prover 274 | - [ ] Economics 275 | - [ ] P2P Network 276 | - [x] DevOps 277 | 278 | ## Test Plan 279 | 280 | ### Asset Transfer Matrix 281 | 282 | We will build out a matrix of conditions to test asset transfers. This will include: 283 | 284 | - different network topologies (number of nodes, validators, pxes) 285 | - different network conditions (latency, node failures) 286 | 287 | Specifically, we will ensure that: 288 | 289 | - A block is proposed and added to the Pending Chain 290 | - A block is proven and added to the Proven Chain 291 | - A block is finalized 292 | - The network can tolerate a sequencer going offline 293 | - The network can tolerate a prover going offline 294 | - The network can tolerate a sequencer submitting an invalid block 295 | - The network can tolerate sequencers/validators with slow connections 296 | 297 | ### Attack Scenarios 298 | 299 | We will verify: 300 | - A block proposed without a signature from the current proposer should fail 301 | - A prover submitting a proof with an invalid proof 302 | - A coalition of dishonest sequencers submitting a block with an invalid state transition 303 | - We can tolerate "soft" L1 censorship (i.e. the anvil instance is down for a period of time) 304 | 305 | ### Existing e2e tests 306 | 307 | Existing e2e tests will continue to work as they do now. 308 | 309 | We will gradually port the tests to work with either the existing setup or the new setup, configurable via an environment variable; this likely can be done by simply pointing the wallets that run in the tests to PXEs in the k8s cluster. 
310 | 311 | ## Documentation Plan 312 | 313 | We will write documentation on how people can join the `spartan` network. 314 | 315 | ## Timeline 316 | 317 | - [ ] 2024-08-16: Target Approval Date 318 | - [ ] 2024-08-21: Small network passing asset transfer in CI. No chaos. 319 | - [ ] 2024-08-28: Large network passing asset transfer in CI with chaos. 320 | - [ ] 2024-09-04: Attack scenarios passing in CI. 321 | - [ ] 2024-09-09: `staging` network deployed 322 | - [ ] 2024-09-11: `spartan` network deployed 323 | 324 | ## Future Work 325 | 326 | There is also [attack net](https://github.com/crytic/attacknet) that works with k8s and chaos-mesh and is purpose-built for testing blockchains in adversarial conditions. 327 | 328 | This will be useful to add to the network tests, especially around L1 censorship resistance. 329 | 330 | -------------------------------------------------------------------------------- /in-progress/8077-request-response/design.md: -------------------------------------------------------------------------------- 1 | # Request/Response for Transactions 2 | 3 | | | | 4 | | -------------------- | -------------------------------------------------------------------------------- | 5 | | Issue | [feat(vc): ReqResp](https://github.com/AztecProtocol/aztec-packages/issues/8077) | 6 | | Owners | @Maddiaa0 | 7 | | Approvers | @just-mitch | 8 | | Target Approval Date | 2024-09-17 | 9 | 10 | 11 | ## Executive Summary 12 | 13 | Validator nodes need a mechanism to fetch transactions from peers that they have been asked to sign as part of a proposed block. This design proposes a request/response mechanism for fetching transactions from peers. 14 | 15 | **NOTE:** This has been largely copied from https://github.com/AztecProtocol/engineering-designs/pull/19/files, which was closed in favor of breaking that design into smaller pieces; credit should go to @Maddiaa0 for the design presented here. 
16 | 17 | ## P2P Layer Description 18 | 19 | Validators are expected to validate all private transaction proofs before signing an attestation to a block proposal. A block proposal will contain (at LEAST) a list of all of the transaction hashes that are to be included within the block. As an extension, and provided the p2p layer can handle large message payloads, the entire transaction payload - not just the transaction hash - may be included within the block proposal. 20 | 21 | The happy path for the p2p layer is that gossiped transactions are broadcast to all peers, and then further propagated throughout the network. If only the transaction hashes are included in the block proposal, there is a possibility that a node may not have the full transaction payload for a proposed block. In the case that a node does not have the full transaction payload for a proposed block, the node should request the full transaction payload from a peer. This will be performed by a request response protocol between nodes. 22 | 23 | 24 | ## Implementation 25 | 26 | ### Protocol Identification 27 | Each request response interaction is identified by a protocolID (a.k.a. SubProtocol). 28 | This protocol will take a form similar to the `gossipSub` protocol topic, the form is expected to be as follows. 29 | 30 | ``` 31 | /aztec/req/{subprotocol}/{version} 32 | ``` 33 | 34 | Where subprotocol details the type of request being made, and version details the version of the protocol being used, the exact form of version is yet to be determined. It may represent a semantic version (such as a hardfork), or it may represent a specific protocol implementation version (semver identifier). 35 | 36 | Whatever version standard is used will match that used by the `gossipSub` protocol topics. 37 | 38 | ### Request / Response Interactions 39 | When a validator requires a payload from a peer, they will dial their peer, opening up ONE stream per interaction. 
This stream will be closed when this interaction is complete, regardless of whether the interaction was successful or not. 40 | 41 | ### Request / Response Interfaces 42 | 43 | Each request response interaction is explicitly defined by the protocolID (a.k.a. SubProtocol). This protocolID can be used to determine the expected message request types, and the expected data response types. 44 | 45 | *Request / Response Mapping* 46 | 47 | ```ts 48 | /** 49 | * The Request Response Pair interface defines the methods that each 50 | * request response pair must implement 51 | */ 52 | interface RequestResponsePair { 53 | request: new (...args: any[]) => Req; 54 | /** 55 | * The response must implement the static fromBuffer method (generic serialisation) 56 | */ 57 | response: { 58 | new (...args: any[]): Res; 59 | fromBuffer(buffer: Buffer): Res; 60 | }; 61 | } 62 | ``` 63 | 64 | For an example transaction request protocol, the serialized request and response types are defined as follows: 65 | - The request will send a TransactionHash, which means the responder should expect strict serialization of the TransactionHash type. 66 | - The response will send a Transaction, which means the responder should expect strict serialization of the Transaction type. 67 | 68 | ```ts 69 | '/aztec/req/tx/0.1.0': { 70 | request: TxHash, 71 | response: Tx, 72 | }, 73 | ``` 74 | 75 | ### Making a request 76 | When opening a stream with a peer, a specific protocolID (a.k.a. SubProtocol) will be used to determine what data the peer requires. For example, a SubProtocol with id `/aztec/req/tx/0.1.0` represents a stream that will be used to request a transaction payload from a peer. 77 | 78 | If the requesting node does not receive a response from the peer, it will wait for a maximum `REQUEST_TIMEOUT` before the stream is closed and the node attempts to open a new stream with a different peer. 
79 | 80 | When a peer makes a request via the stream, their side of the stream will be closed upon sending the request, and no further messages will be sent to the peer. 81 | 82 | The requester will read from the stream until: 83 | 1. An error is returned. 84 | 2. The stream is closed by the responding peer. 85 | 3. Any error is processed when interpreting the response. 86 | 4. A maximum number of chunks are read (e.g. the expected size of the payload to be received). 87 | 5. A timeout is reached. 88 | 89 | The response to the message should be processed fully before the stream is closed. 90 | 91 | ### Making a response 92 | Both the requester and responder will negotiate a protocolID (a.k.a. SubProtocol) to be used for the stream. 93 | 94 | Based on the negotiated protocolID, the responder will know the message length to expect from the requester. 95 | It should fail if the message received is not of the expected size, or 96 | if the message cannot be deserialized into the expected type. 97 | 98 | The responder will read from the stream until: 99 | 100 | 1. An error is returned. 101 | 2. The stream is closed by the requesting peer. 102 | 3. Any error is processed when interpreting the request. 103 | 4. A maximum number of chunks are read (e.g. the expected size of the payload to be received). 104 | 5. A timeout is reached. 105 | 106 | At the time of writing, erroneous responses are represented by an empty response type. In the future, it may be beneficial to include additional information about the error, such as an error code and/or message. 107 | 108 | > We do not include any information below the libp2p level in the protocol; the messaging protocol etc. are not defined here. 109 | 110 | 111 | #### An overview of p2p layer flows. 112 | 113 | ![When the request response protocol is used](./p2p-layer-overview.png) 114 | 115 | 116 | ## Change Set 117 | 118 | Fill in bullets for each area that will be affected by this change.
119 | 120 | - [ ] Cryptography 121 | - [ ] Noir 122 | - [ ] Aztec.js 123 | - [ ] PXE 124 | - [ ] Aztec.nr 125 | - [ ] Enshrined L2 Contracts 126 | - [ ] Private Kernel Circuits 127 | - [ ] Sequencer 128 | - [ ] AVM 129 | - [ ] Public Kernel Circuits 130 | - [ ] Rollup Circuits 131 | - [ ] L1 Contracts 132 | - [ ] Prover 133 | - [ ] Economics 134 | - [ ] P2P Network 135 | - [ ] DevOps 136 | 137 | ## Test Plan 138 | 139 | Outline what unit and e2e tests will be written. Describe the logic they cover and any mock objects used. 140 | 141 | ## Documentation Plan 142 | 143 | Identify changes or additions to the user documentation or protocol spec. 144 | 145 | 146 | ## Rejection Reason 147 | 148 | If the design is rejected, include a brief explanation of why. 149 | 150 | ## Abandonment Reason 151 | 152 | If the design is abandoned mid-implementation, include a brief explanation of why. 153 | 154 | ## Implementation Deviations 155 | 156 | If the design is implemented, include a brief explanation of deviations to the original design. 157 | 158 | ## Disclaimer 159 | 160 | The information set out herein is for discussion purposes only and does not represent any binding indication or commitment by Aztec Labs and its employees to take any action whatsoever, including relating to the structure and/or any potential operation of the Aztec protocol or the protocol roadmap. 
In particular: (i) nothing in these projects, requests, or comments is intended to create any contractual or other form of legal relationship with Aztec Labs or third parties who engage with this AztecProtocol GitHub account (including, without limitation, by responding to a conversation or submitting comments) (ii) by engaging with any conversation or request, the relevant persons are consenting to Aztec Labs’ use and publication of such engagement and related information on an open-source basis (and agree that Aztec Labs will not treat such engagement and related information as confidential), and (iii) Aztec Labs is not under any duty to consider any or all engagements, and that consideration of such engagements and any decision to award grants or other rewards for any such engagement is entirely at Aztec Labs’ sole discretion. Please do not rely on any information on this account for any purpose - the development, release, and timing of any products, features, or functionality remains subject to change and is currently entirely hypothetical. Nothing on this account should be treated as an offer to sell any security or any other asset by Aztec Labs or its affiliates, and you should not rely on any content or comments for advice of any kind, including legal, investment, financial, tax, or other professional advice. 
161 | -------------------------------------------------------------------------------- /in-progress/8077-request-response/p2p-layer-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/8077-request-response/p2p-layer-overview.png -------------------------------------------------------------------------------- /in-progress/8401-proof-timeliness/proof-timeliness.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from scipy.stats import hypergeom" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "def calculate_probability(V, C):\n", 19 | " \"\"\"\n", 20 | " Calculate the probability of drawing more than 2/3 malicious validators\n", 21 | " in a committee of size C from a total validator set of size V.\n", 22 | "\n", 23 | " :param V: Total number of validators\n", 24 | " :param C: Committee size\n", 25 | " :return: Probability\n", 26 | " \"\"\"\n", 27 | " malicious = V // 3 # Number of malicious validators (1/3 of total)\n", 28 | " threshold = int(2 * C // 3) # Threshold for malicious majority\n", 29 | "\n", 30 | " # Calculate the probability of having more than threshold malicious validators\n", 31 | " prob = 1 - hypergeom.cdf(threshold, V, malicious, C)\n", 32 | "\n", 33 | " return prob\n", 34 | "\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "def calculate_first_p_malicious(V, C, P):\n", 44 | " \"\"\"\n", 45 | " Calculate the probability that the first P validators in a randomly ordered\n", 46 | " committee of size C (drawn from a total set of V 
validators) are all malicious.\n", 47 | "\n", 48 | " :param V: Total number of validators\n", 49 | " :param C: Committee size\n", 50 | " :param P: Number of first validators to check\n", 51 | " :return: Probability\n", 52 | " \"\"\"\n", 53 | " malicious = V // 3 # Number of malicious validators (1/3 of total)\n", 54 | "\n", 55 | " # Probability of exactly k malicious validators in the committee\n", 56 | " prob_k_malicious = lambda k: hypergeom.pmf(k, V, malicious, C)\n", 57 | "\n", 58 | " total_prob = 0\n", 59 | " for k in range(P, C + 1): # k cannot be less than P\n", 60 | " prob_committee = prob_k_malicious(k)\n", 61 | " prob_first_p_malicious = 1\n", 62 | " for i in range(P):\n", 63 | " prob_first_p_malicious *= (k - i) / (C - i)\n", 64 | " total_prob += prob_committee * prob_first_p_malicious\n", 65 | "\n", 66 | " return total_prob" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "def find_smallest_C(V, threshold=1e-6):\n", 76 | " \"\"\"\n", 77 | " Find the smallest C such that the probability of more than 2/3 of the\n", 78 | " committee being malicious is less than the threshold.\n", 79 | "\n", 80 | " :param V: Total number of validators\n", 81 | " :param threshold: Probability threshold\n", 82 | " :return: Smallest C meeting the criteria\n", 83 | " \"\"\"\n", 84 | " C = 1\n", 85 | " while calculate_probability(V, C) >= threshold:\n", 86 | " C += 1\n", 87 | " return C\n", 88 | "\n", 89 | "\n", 90 | "def find_smallest_P(V, C, threshold=1e-6):\n", 91 | " \"\"\"\n", 92 | " Find the smallest P such that the probability of the first P validators\n", 93 | " in a row being malicious is less than the threshold.\n", 94 | "\n", 95 | " :param V: Total number of validators\n", 96 | " :param C: Committee size\n", 97 | " :param threshold: Probability threshold\n", 98 | " :return: Smallest P meeting the criteria\n", 99 | " \"\"\"\n", 100 | " P = 1\n", 101 | " while 
calculate_first_p_malicious(V, C, P) >= threshold:\n", 102 | " P += 1\n", 103 | " return P" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "For V = 10000:\n", 116 | "Smallest C: 48\n", 117 | "Probability of malicious majority for C = 48: 5.35e-07\n", 118 | "Smallest P for C = 48: 13\n", 119 | "Probability of first 13 being malicious: 6.17e-07\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "# Example usage\n", 125 | "V = 10000 # Total number of validators\n", 126 | "threshold = 1e-6\n", 127 | "\n", 128 | "smallest_C = find_smallest_C(V, threshold)\n", 129 | "smallest_P = find_smallest_P(V, smallest_C, threshold)\n", 130 | "\n", 131 | "print(f\"For V = {V}:\")\n", 132 | "print(f\"Smallest C: {smallest_C}\")\n", 133 | "print(\n", 134 | " f\"Probability of malicious majority for C = {smallest_C}: {calculate_probability(V, smallest_C):.2e}\"\n", 135 | ")\n", 136 | "print(f\"Smallest P for C = {smallest_C}: {smallest_P}\")\n", 137 | "print(\n", 138 | " f\"Probability of first {smallest_P} being malicious: {calculate_first_p_malicious(V, smallest_C, smallest_P):.2e}\"\n", 139 | ")" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": ".venv", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.11.8" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 2 164 | } 165 | -------------------------------------------------------------------------------- /in-progress/8401-proof-timeliness/proving-phases.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/8401-proof-timeliness/proving-phases.png -------------------------------------------------------------------------------- /in-progress/8404-based-fallback.md: -------------------------------------------------------------------------------- 1 | # Based Fallback 2 | 3 | | | | 4 | | -------------------- | --------------------------------- | 5 | | Issue | [title](github.com/link/to/issue) | 6 | | Owners | @LHerskind | 7 | | Approvers | @just-mitch @aminsammara | 8 | | Target Approval Date | 2024-11-13 | 9 | 10 | ## Executive Summary 11 | 12 | We propose a method which ensures Ledger Growth, even if the committee is non-cooperative. 13 | Based fallback provides us with these guarantees, assuming that the base-layer is accessible. 14 | 15 | ## Introduction 16 | 17 | The Aztec network is a Rollup on Ethereum L1 that has its own consensus layer to support build-ahead. 18 | Since the consensus layer is only there for the ability to have fast ledger growth, e.g., block times that are smaller than proving times, it should not be strictly required for the state of the chain to be progressed. 19 | Therefore, we provide a fallback mechanism to handle the case where the committee fails to perform its duties - allowing anyone to grow the ledger. 20 | 21 | The nature in which they could fail to perform their duties varies widely. 22 | It could be an attack to try and censor the network, or it might be that a majority of the network is running a node that corrupts its state on the 13th of August every year as an homage to the Aztec empire. 23 | 24 | Nevertheless, we need a mechanism to ensure the liveness of the chain, even if slower, in these events.
25 | 26 | > Note: 27 | > While this is not directly dependent on forced inclusions, forced inclusions being ignored is one way that we might enter this fallback. 28 | > Thereby the fallback is practically what will ensure that forced inclusions can be included. 29 | 30 | For the fallback, we will define the following: 31 | 32 | - How will we enter the based fallback 33 | - How are blocks proposed when in this mode 34 | - They need proofs along with the proposal 35 | - How do we exit 36 | 37 | ## Interface 38 | 39 | The primary users of the fallback mode will be actors similar to the sequencers (with some expected overlap). 40 | They are block builders that wish to push a block when the chain is otherwise dead, or censored. 41 | 42 | Nevertheless, they interact with the system differently, as they have a much more limited setup in which they can produce blocks; my expectation is that it is closer to an on-demand node than the usual validator setup. 43 | 44 | ## Implementation 45 | 46 | ### Entering Based Fallback 47 | 48 | The based fallback is expected to have a significantly worse user experience since proofs must be provided as well; it is therefore less likely that they will happen in quick succession - unless only a single actor will try to extend the chain. 49 | 50 | Because of this, we really do not want to enter the fallback too often, but we need to make it happen often enough that it is usable. 51 | For applications that are dependent on users acting based on external data or oracle data, a low bar to enter based mode can be desirable since it will mean that they would be able to keep running more easily. 52 | Lending and trading fall into these categories as they usually depend on external data sources, e.g., prices of CEXes influence how people use a trading platform and lending platforms mostly use oracles to get prices.
53 | 54 | We suggest defining the time where Based fallback can be entered to be $T_{\textsf{fallback}, \textsf{enter}}$ after the last proven block. 55 | The minimum acceptable value for $T_{\textsf{fallback}, \textsf{enter}}$ should therefore cover the case where a committee fails to perform its proving duties, i.e., it should be twice the full epoch $E$ specified in https://github.com/AztecProtocol/engineering-designs/pull/22 ($2E$). 56 | 57 | The reason we want $T_{\textsf{fallback}, \textsf{enter}} > 2 E$ is fairly simple, but more easily seen with a drawing. 58 | In the figure, we mark a proven block with a gray background. 59 | We don't have skipped slots in the below diagram as I don't need that kind of complexity right now. 60 | We will be using an epoch length of $32$ slots, i.e., $E = 32$. 61 | 62 | ![alt text](images/8404/image_1.png) 63 | 64 | As you see from the image above, while the last proven block might be block 31, the time it gets proven could be in slot 63, i.e., 1 epoch later. 65 | If $T_{\textsf{fallback}, \textsf{enter}} > E$, then the minimum $T_{\textsf{fallback}, \textsf{enter}}$ would mean that we could enter based fallback at slot 64, without there even being an option for a missed proof. 66 | 67 | Let's say that we use $T_{\textsf{fallback}, \textsf{enter}} = 2E + 1$, and that no more proofs come in - then we will enter the based fallback at slot 96. 68 | 69 | ![alt text](images/8404/image_2.png) 70 | 71 | Note that this will prune the pending chain! 72 | 73 | ### Proposing blocks in based mode 74 | 75 | When the based fallback has been activated, a new set of rules starts applying for what is seen as a valid block proposal. 76 | Namely, because there is no committee to assure availability or correctness of the transactions, we rely on the proposer's claims until the proof arrives. 77 | For this our solution is simple - we require that they provide the proof along with the proposal.
78 | 79 | Note, as shown in the diagram below, that the pending (unproven) blocks are pruned from the chain and the bond for epoch 2 is gone. 80 | 81 | ![alt text](images/8404/image_3.png) 82 | 83 | Beyond the proof being provided at the same time we need to also: 84 | 85 | - Allow skipping the sequencer selection - allow anyone to propose. 86 | - Relax the constraint that the slot of the block must be exactly the current slot. 87 | 88 | The reason we need to relax the constraint around the slot of the block (and likely its epoch) is fairly simple. 89 | If you are to also prove the block, you will be unable to do that within the slot. 90 | Therefore you will either need to prove something that will happen in the future, or we need to allow you more relaxed constraints on the slot. 91 | 92 | In our case, we are already constraining that the block happened `after` the last block, and that it is not in the future so we can just delete the constraint and still be partially sane. 93 | Alternatively, we can take into account the `T_fallback_enter` value such that the slot number cannot be smaller than when we entered the based fallback. 94 | 95 | Nevertheless, the proposer of a based block will wield extra power because they have more control over time than the committee members usually would. 96 | 97 | ### Exiting Based Fallback 98 | 99 | Since the based fallback as mentioned earlier (likely) provides a worse experience than the committee, we also want to exit it as soon as possible. 100 | 101 | However, this is where we start potentially having some issues. 102 | 103 | Say we stopped being in the based fallback when a pending block is proposed: 104 | 105 | - In the honest case, the consensus layer is online again, and can start performing its duties 106 | - In the dishonest case, the consensus layer wishes to censor the chain, and could propose unprovable blocks, while still exiting the fallback mode.
107 | 108 | Another approach might be that we allow a committee to exit the fallback mode if they provide an epoch proof (does not need to include blocks for every slot). 109 | 110 | However, this essentially makes the committee a participant in the race to propose a block in the based fallback, and as they have the overhead of consensus they would be at a disadvantage. 111 | 112 | Furthermore, if they need to build an epoch proof without there being blocks on L1 to point against, we would need to support an extra flow in the node for them. 113 | 114 | A separate direction that I would like to entertain is that we have two ways to "get out" of the based fallback, both based on time, similarly to how we ended up in here. 115 | 116 | 1. If the last proven block is older than $T_{\textsf{fallback}, \textsf{exit}, \textsf{activity}}$ we allow exiting the based fallback. 117 | 2. After $T_{\textsf{fallback}, \textsf{exit}}$ we allow exiting the fallback regardless of when the last proven block was. 118 | 119 | Option 1 ensures that we can quickly leave the based fallback if there is no activity, allowing a good experience for the happy path. 120 | Option 2 ensures that we will not be stuck forever in based fallback even if there is a malicious entity pushing blocks once in a while. 121 | As a bonus, they are simple rules so my 🥜🧠 can deal with them. 122 | 123 | ![alt text](images/8404/image_4.png) 124 | 125 | ### Relation to forced inclusion 126 | 127 | As mentioned, one of the cases where we enter the based fallback is if the validator set is massively censoring. 128 | If the L1 contract enforces that there is a minimum block size, the committee's only option to try and censor will be to entirely stall the rollup. 129 | 130 | After $T_{\textsf{fallback}, \textsf{enter}}$ time that would push us into the based fallback. 131 | At this point, anyone can push the blocks with the censored transactions.
132 | Even if the committee is trying to limit the throughput by only pushing the minimum blocks, $T_{\textsf{fallback}, \textsf{exit}}$ can then be picked such that a minimum throughput could be supported over the duration. 133 | The minimum throughput here is under the assumption that either they will need to push minimum blocks as fast as they can, or someone else might push a bigger block exiting even more. 134 | 135 | ### Changes to existing architecture 136 | 137 | The changes should mostly be on the L1 contract and the sequencer client, with a minor potential change for the rollup circuit (might simply not be necessary). 138 | 139 | - The L1 contract needs to deal with the flows as above. 140 | - The sequencer client needs to know when fallback is activated such that it does not try to act as it normally would. 141 | - I believe simply doing nothing is best here, as the fallbacks can be seen as a fully separate type of sequencer then. 142 | - Add a new "fallback" sequencer, which skips a lot of the checks of the current sequencer but mainly builds and proves blocks. 143 | - The L1 contract/rollup circuit mix needs to support individual block proofs for this specific mode. 144 | - This could be taking a single BlockRoot rollup and publishing it. 145 | 146 | ## Change Set 147 | 148 | Fill in bullets for each area that will be affected by this change. 149 | 150 | - [ ] Cryptography 151 | - [ ] Noir 152 | - [ ] Aztec.js 153 | - [ ] PXE 154 | - [ ] Aztec.nr 155 | - [ ] Enshrined L2 Contracts 156 | - [ ] Private Kernel Circuits 157 | - [x] Sequencer 158 | - [ ] AVM 159 | - [ ] Public Kernel Circuits 160 | - [x] Rollup Circuits 161 | - [x] L1 Contracts 162 | - [ ] Prover 163 | - [ ] Economics 164 | - [ ] P2P Network 165 | - [ ] DevOps 166 | 167 | ## Test Plan 168 | 169 | - A majority of the tests will be related to the L1 contracts specifically, and the full flow can be tested without requiring an additional node.
170 | - A separate E2E test running the fallback node should be tried 171 | - A network test entering and exiting fallback gracefully. 172 | 173 | ## Documentation Plan 174 | 175 | - The user documentation needs to be extended on how to run this special type of sequencer 176 | - Protocol spec needs to be extended around block building to outline these edge cases 177 | 178 | ## Rejection Reason 179 | 180 | If the design is rejected, include a brief explanation of why. 181 | 182 | ## Abandonment Reason 183 | 184 | If the design is abandoned mid-implementation, include a brief explanation of why. 185 | 186 | ## Implementation Deviations 187 | 188 | If the design is implemented, include a brief explanation of deviations to the original design. 189 | 190 | ## Disclaimer 191 | 192 | The information set out herein is for discussion purposes only and does not represent any binding indication or commitment by Aztec Labs and its employees to take any action whatsoever, including relating to the structure and/or any potential operation of the Aztec protocol or the protocol roadmap. In particular: (i) nothing in these projects, requests, or comments is intended to create any contractual or other form of legal relationship with Aztec Labs or third parties who engage with this AztecProtocol GitHub account (including, without limitation, by responding to a conversation or submitting comments) (ii) by engaging with any conversation or request, the relevant persons are consenting to Aztec Labs’ use and publication of such engagement and related information on an open-source basis (and agree that Aztec Labs will not treat such engagement and related information as confidential), and (iii) Aztec Labs is not under any duty to consider any or all engagements, and that consideration of such engagements and any decision to award grants or other rewards for any such engagement is entirely at Aztec Labs’ sole discretion. 
Please do not rely on any information on this account for any purpose - the development, release, and timing of any products, features, or functionality remains subject to change and is currently entirely hypothetical. Nothing on this account should be treated as an offer to sell any security or any other asset by Aztec Labs or its affiliates, and you should not rely on any content or comments for advice of any kind, including legal, investment, financial, tax, or other professional advice. 193 | -------------------------------------------------------------------------------- /in-progress/8509-prover-coordination/design.md: -------------------------------------------------------------------------------- 1 | # Prover Coordination 2 | 3 | | | | 4 | | -------------------- | ----------------------------------------------------------- | 5 | | Issue | https://github.com/AztecProtocol/aztec-packages/issues/8509 | 6 | | Owners | @just-mitch | 7 | | Approvers | @aminsammara @LHerskind @spalladino @PhilWindle | 8 | | Target Approval Date | 2024-09-18 | 9 | 10 | 11 | ## Executive Summary 12 | 13 | Presently, there is no coordination between proposers or provers. A coordination mechanism is necessary to ensure that: 14 | 15 | 1. Proposers are able to obtain proofs for epochs, considering they will likely not have proving infrastructure. 16 | 2. Provers are not obliged to "race" each other to submit proofs, as this would be a waste of resources. 17 | 18 | The protocol only "cares" about the proofs being submitted, but in practice, the node will need to coordinate the submission of proofs in a timely, efficient manner. 19 | 20 | The aztec node will provide an interface for proving marketplaces to submit quotes for proving an epoch. 21 | 22 | The pricing on the quotes will be specified in basis points, which reflect the percentage of the total TST rewards contained in the epoch that the prover will receive if a proof of the epoch is submitted in time. 
See [the design for proof timeliness for additional details on the L1 interface](https://github.com/AztecProtocol/engineering-designs/pull/22). 23 | 24 | The quotes will be _binding_: proposers will be able to submit a quote to the rollup contract, which will stake a bond that the prover had previously deposited in escrow, without additional coordination with the prover. 25 | 26 | Provers will be able to submit their proofs of an epoch to the rollup contract without additional coordination with the proposer. 27 | 28 | The _structure_ of the quote is enshrined. 29 | 30 | The _coordination_ of how a quote is obtained by a proposer is not enshrined. 31 | 32 | We propose that an optional topic on the p2p network be used for this coordination. 33 | 34 | We expect the community to develop alternatives for proposers to obtain quotes from provers, e.g. private relays, and proposers will be free to find quotes from any source. 35 | 36 | ## Interface 37 | 38 | Proving marketplaces will run "prover nodes", which will follow the pending chain. 39 | 40 | Prover nodes will have the ability to detect when there is an epoch to be proven. 41 | 42 | Prover nodes can submit quotes to the p2p network, which proposers can then use to claim the right to submit a proof of an epoch. 43 | 44 | To do this, the prover node will submit the following message to `/aztec/epoch-proof-quotes/0.1.0` 45 | 46 | 47 | ```solidity 48 | struct EpochProofQuote { 49 | Signature signature; 50 | uint256 epochToProve; 51 | uint256 validUntilSlot; 52 | uint256 bondAmount; 53 | uint32 basisPointFee; 54 | } 55 | ``` 56 | 57 | The `signature` will be produced using the L1 private key defined in the environment variable `PROVER_PUBLISHER_PRIVATE_KEY` to sign the message `keccak256(abi.encode(epochToProve, validUntilSlot, bondAmount, rollup, basisPointFee))`. 58 | 59 | The Proposer will be able to submit this Quote to `claimEpochProofRight` on the rollup contract.
See [the design for proof timeliness](https://github.com/AztecProtocol/engineering-designs/pull/22) for more info. 60 | 61 | As an overview, L1 contracts will verify: 62 | - The quote was intended for this rollup contract 63 | - The current epoch is in the proof claim phase 64 | - There is not already a claim/proof for this epoch 65 | - The quote has been signed by a prover with an available bond 66 | - The current proposer (from the perspective of the rollup) matches `msg.sender` 67 | - The epoch on the quote is the one the rollup contract is expecting (i.e. the oldest unproven epoch) 68 | 69 | If all conditions are met, the rollup will: 70 | - bond the amount specified in the quote within the escrow contract 71 | - store the quote, and the address of the proposer 72 | 73 | When the prover submits the proof to the rollup contract, the rollup contract will pay out TST rewards to the proposer after paying the prover the basis point fee contained in the quote. It will also unstake the bond within the escrow contract. 74 | 75 | ### Concerns and Mitigations 76 | 77 | #### Public quotes 78 | 79 | Provers may not be comfortable submitting their quotes on the public p2p network, as this is effectively a first-price auction, which is not ideal for the prover as it may drive down the price of their quotes. This is mitigated by the fact that this coordination mechanism is optional; a prover can stand up their own API and provide quotes to proposers directly. 80 | 81 | Performing a second-price or sealed-bid auction is not deemed necessary at this time. 82 | 83 | #### Quotes without bonds 84 | 85 | A prover might submit a quote but not actually have the funds to post a bond. This is mitigated by using a custom escrow that requires the prover to deposit the bond before submitting a quote. The escrow will have a delayed withdrawal process, so a proposer can query the escrow contract, then be confident that the funds will be there when they `claimEpochProofRight`.
86 | 87 | ## Implementation 88 | 89 | ### `EpochProofQuote` 90 | 91 | EpochProofQuote needs an implementation in `circuit-types`, and needs to implement `Gossipable`. 92 | 93 | ### In memory `EpochProofQuotePool` 94 | 95 | We will need a pool of `EpochProofQuote` objects. 96 | 97 | Its initial interface will be: 98 | ```typescript 99 | interface EpochProofQuotePool { 100 | addQuote(quote: EpochProofQuote): void; 101 | getQuotes(epoch: number): EpochProofQuote[] | undefined; 102 | } 103 | ``` 104 | 105 | We will implement an `InMemoryEpochProofQuotePool` that stores quotes in memory: durable storage is not deemed necessary at this time. 106 | 107 | The implementation will only return quotes that are still valid. 108 | 109 | ### Extension to P2P Client 110 | 111 | The `P2P` interface will be extended with 112 | ```typescript 113 | interface P2P { 114 | //... 115 | 116 | sendEpochProofQuote(quote: EpochProofQuote): Promise<void>; 117 | getEpochProofQuotes(epoch: number): Promise<EpochProofQuote[]>; 118 | } 119 | ``` 120 | 121 | The `P2PClient` will be extended with 122 | ```typescript 123 | class P2PClient { 124 | //... 125 | 126 | public async sendEpochProofQuote(quote: EpochProofQuote): Promise<void> { 127 | const ready = await this.isReady(); 128 | if (!ready) { 129 | throw new Error('P2P client not ready'); 130 | } 131 | await this.epochProofQuotePool.addQuote(quote); 132 | // we get `propagate` "for free" by implementing `Gossipable` on `EpochProofQuote` 133 | this.p2pService.propagate(quote); 134 | } 135 | } 136 | ``` 137 | 138 | ### Extension to `LibP2PService` 139 | 140 | A new "route" needs to be added for the topic used by `EpochProofQuote` messages within `handleNewGossipMessage`. 141 | 142 | It will call `processEpochProofQuoteFromPeer`, which will add the quote to the `EpochProofQuotePool` if it is still valid. 143 | 144 | ### ProofQuoteGovernor 145 | 146 | The `ProofQuoteGovernor` is part of the Prover Node, and will be responsible for: 147 | 1.
Detecting that an epoch has ended and producing a quote for the epoch 148 | 2. Detecting that the quote was accepted 149 | 150 | The Governor will sit on the main `work` loop of the prover node. 151 | 152 | Its initial interface will be: 153 | 154 | ```typescript 155 | interface ProofQuoteGovernor { 156 | ensureBond(amount: number): Promise; 157 | produceEpochProofQuote(epoch: number): Promise; 158 | } 159 | ``` 160 | 161 | When the prover node starts up, it will call `ensureBond` to ensure it has the required bond amount in escrow. 162 | 163 | The prover node will detect that an epoch has ended, and if `produceEpochProofQuote` returns a quote (not undefined), it will submit a quote to the p2p network. 164 | 165 | Separately, it needs a watcher on L1 to detect if its quote has been selected. 166 | 167 | To this end, the `L1Publisher` will be extended with a new method: 168 | 169 | ```typescript 170 | interface L1Publisher { 171 | getEpochProofClaim(): Promise; 172 | } 173 | ``` 174 | 175 | The Prover node will call this method once per L2 slot to check if its quote has been selected. 176 | 177 | If so, it will start building the proof and submit it to the rollup contract. 178 | 179 | ### ProofQuotePricingService 180 | 181 | The `ProofQuotePricingService` will be responsible for determining the basis point fee and bond amount for a quote. 182 | 183 | It will have an interface: 184 | 185 | ```typescript 186 | interface ProofDetails { 187 | totalSubproofs: number; 188 | } 189 | 190 | interface ProofPricingService { 191 | getPricing(proofDetails: ProofDetails): Promise<{ basisPointFee: number; bondAmount: number } | undefined>; 192 | } 193 | ``` 194 | 195 | The default implementation of the `ProofQuoteGovernor` will take a `ProofPricingService` in its constructor, and call `getPricing` when it needs to produce a quote. 
196 | 197 | The default implementation of the `ProofPricingService` will be to always return the same basis point fee and bond amount, which will be set in the environment variables `PROVER_BASIS_POINT_FEE` and `PROVER_BOND_AMOUNT`. 198 | 199 | ### Augment Proposer Logic 200 | 201 | With the completion of [#8576 proposers submit proof claims](https://github.com/AztecProtocol/aztec-packages/issues/8576), proposers will be able to submit proof claims to the rollup contract using quotes they produced themselves. 202 | 203 | As noted in that issue, a separate implementation will be needed to allow proposers to submit proof claims using quotes they received from provers. 204 | 205 | The tentative interfaces for this are: 206 | 207 | ```typescript 208 | interface EpochProofQuoteSource { 209 | getEpochProofQuotes(epoch: number): Promise; 210 | } 211 | 212 | interface EpochProofQuoteAggregator { 213 | setSources(sources: EpochProofQuoteSource[]): void; 214 | getQuote(epoch: number): Promise; 215 | } 216 | ``` 217 | 218 | Thus, `P2P` is an `EpochProofQuoteSource`, and the `EpochProofQuoteAggregator` will be used by the proposer to obtain quotes from the p2p network. 219 | 220 | 221 | ### JSON-RPC Fallback 222 | 223 | To support testing, we will create a JSON-RPC endpoint on the sequencers that prover nodes can send quotes to. 224 | 225 | This will manifest in two places. 226 | 227 | #### Update JSON-RPC of Sequencer 228 | 229 | ```typescript 230 | interface AztecNode { 231 | // ... 232 | addEpochProofQuote(quote: EpochProofQuote): Promise; 233 | } 234 | ``` 235 | 236 | Under the hood, this will inject the quote into the `EpochProofQuotePool`. 237 | 238 | #### Update Prover Node 239 | 240 | When the `ProofQuoteGovernor` produces a quote, it will send it to the node specified in its `AZTEC_NODE_URL` using the JSON-RPC endpoint in addition to the p2p network. 
241 | 242 | 243 | ## Future Work 244 | 245 | ### Stricter ProofQuoteGovernor 246 | 247 | The `ProofQuoteGovernor` should be updated to only produce a quote (and ask the pricing service for pricing) if it is convinced it has all the data required to produce a proof. 248 | 249 | ### Pricing Service accepts endpoint configuration 250 | 251 | The pricing service should be able to accept an endpoint configuration, so that the prover can use a third party to determine the pricing of their quotes. 252 | 253 | In this case, if the endpoint is defined, the `ProofPricingService` will call the endpoint to determine the pricing. 254 | Otherwise it will use the default, static pricing. 255 | 256 | ### Peer Scoring 257 | 258 | If a peer propagates a quote that is not valid, we will penalize their peer score. 259 | 260 | ### Proof Production Pre-confirmations 261 | 262 | It would be ideal if provers could start proving an epoch before it has actually ended. To this end we can envision a separate out-of-protocol mechanism where a prover can submit a quote for the current epoch, potentially with additional metadata. 263 | 264 | A proposer can "promise" to select the quote. 265 | 266 | This gives the prover assurances to start proving the epoch before it has ended, which would reduce the lag time between the end of the epoch and the submission of the proof. 267 | 268 | 269 | ## Change Set 270 | 271 | Fill in bullets for each area that will be affected by this change. 272 | 273 | - [ ] Cryptography 274 | - [ ] Noir 275 | - [ ] Aztec.js 276 | - [ ] PXE 277 | - [ ] Aztec.nr 278 | - [ ] Enshrined L2 Contracts 279 | - [ ] Private Kernel Circuits 280 | - [x] Sequencer 281 | - [ ] AVM 282 | - [ ] Public Kernel Circuits 283 | - [ ] Rollup Circuits 284 | - [x] L1 Contracts 285 | - [ ] Prover 286 | - [x] Economics 287 | - [x] P2P Network 288 | - [ ] DevOps 289 | 290 | ## Test Plan 291 | 292 | This is a high-level test plan. 
More detailed test plans will be created as part of the implementation. 293 | 294 | Generally, tests will be needed around the following: 295 | 296 | 1. Prover's Submission of Epoch Proof Quotes 297 | 2. Prover's Bond Management 298 | 3. P2P Network Handling of Epoch Proof Quotes 299 | 4. Proposer's Retrieval and Handling of Quotes 300 | 5. Proposer's Submission of Proof Claims to the Rollup Contract 301 | 6. Prover's Detection of Quote Acceptance and Proof Submission 302 | 7. Rollup Contract's Processing of Proof Claims and Proofs 303 | 8. Payout of TST Rewards 304 | 305 | The stress test we are building toward is: 306 | 20 actors representing proving marketplaces can submit bids, that at most 1 bid can be accepted on L1 for a given epoch, that the bid can only be accepted during the proof claim period of a prescribed epoch, that the bid is only actionable on L1 if the prover has the requisite bond in TST, and that all the above works when there are 300 nodes on the p2p network and the chain is operating at 1 TPS with proving turned on. 307 | 308 | ## Documentation Plan 309 | 310 | Provers will need documentation on how to set up their node to submit quotes to the p2p network. 311 | 312 | ## Rejection Reason 313 | 314 | If the design is rejected, include a brief explanation of why. 315 | 316 | ## Abandonment Reason 317 | 318 | If the design is abandoned mid-implementation, include a brief explanation of why. 319 | 320 | ## Implementation Deviations 321 | 322 | If the design is implemented, include a brief explanation of deviations to the original design. 323 | 324 | ## Disclaimer 325 | 326 | The information set out herein is for discussion purposes only and does not represent any binding indication or commitment by Aztec Labs and its employees to take any action whatsoever, including relating to the structure and/or any potential operation of the Aztec protocol or the protocol roadmap. 
In particular: (i) nothing in these projects, requests, or comments is intended to create any contractual or other form of legal relationship with Aztec Labs or third parties who engage with this AztecProtocol GitHub account (including, without limitation, by responding to a conversation or submitting comments) (ii) by engaging with any conversation or request, the relevant persons are consenting to Aztec Labs’ use and publication of such engagement and related information on an open-source basis (and agree that Aztec Labs will not treat such engagement and related information as confidential), and (iii) Aztec Labs is not under any duty to consider any or all engagements, and that consideration of such engagements and any decision to award grants or other rewards for any such engagement is entirely at Aztec Labs’ sole discretion. Please do not rely on any information on this account for any purpose - the development, release, and timing of any products, features, or functionality remains subject to change and is currently entirely hypothetical. Nothing on this account should be treated as an offer to sell any security or any other asset by Aztec Labs or its affiliates, and you should not rely on any content or comments for advice of any kind, including legal, investment, financial, tax, or other professional advice. 
327 | -------------------------------------------------------------------------------- /in-progress/8754-slashing-staking/Contract_Overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/8754-slashing-staking/Contract_Overview.png -------------------------------------------------------------------------------- /in-progress/8757-fees/notebook/.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info -------------------------------------------------------------------------------- /in-progress/8757-fees/notebook/.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /in-progress/8757-fees/notebook/README.md: -------------------------------------------------------------------------------- 1 | Go look in the `fee-model.ipynb` notebook for the actual simulation and fee model. 2 | 3 | The setup is created with `uv`, so you should be good to run `uv sync` to get setup. Consider running `uv venv` to create a virtual environment first. You can add the kernel for the notebook by running `python -m ipykernel install --user --name=fee-model --display-name "Python (fee-model)"` and then selecting the kernel in the notebook settings. 
4 | -------------------------------------------------------------------------------- /in-progress/8757-fees/notebook/ape-config.yaml: -------------------------------------------------------------------------------- 1 | name: fee-modelling 2 | 3 | default_ecosystem: ethereum 4 | 5 | node: 6 | ethereum: 7 | mainnet: 8 | uri: http://localhost:8545 9 | -------------------------------------------------------------------------------- /in-progress/8757-fees/notebook/blocks.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/8757-fees/notebook/blocks.pkl -------------------------------------------------------------------------------- /in-progress/8757-fees/notebook/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "simple-fee-model" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "eth-ape[recommended-plugins]", 9 | "ipykernel>=6.29.5", 10 | "matplotlib>=3.9.2", 11 | "numpy>=1.26.4", 12 | "pydantic>=2.9.2", 13 | ] 14 | -------------------------------------------------------------------------------- /in-progress/images/7482-contracts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/images/7482-contracts.png -------------------------------------------------------------------------------- /in-progress/images/8404/image_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/images/8404/image_1.png 
-------------------------------------------------------------------------------- /in-progress/images/8404/image_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/images/8404/image_2.png -------------------------------------------------------------------------------- /in-progress/images/8404/image_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/images/8404/image_3.png -------------------------------------------------------------------------------- /in-progress/images/8404/image_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/images/8404/image_4.png -------------------------------------------------------------------------------- /in-progress/proving-queue/0005-proving-queue.md: -------------------------------------------------------------------------------- 1 | | | | 2 | | -------------------- | --------------------------------------------------------------------------------------------- | 3 | | Owners | @PhilWindle | 4 | | Approvers | @just-mitch @alexghr @spalladino | 5 | | Target Approval Date | 2024-08-23 | 6 | 7 | 8 | ## Executive Summary 9 | 10 | This design attempts to solve the problem of scaling proof generation without the introduction of any new technologies or dependencies 11 | 12 | 13 | ## Introduction 14 | 15 | We need to prove blocks of transactions in a timely fashion. These blocks will likely consist of 10s of thousands of proofs of varying computational complexity. We will likely need to orchestrate the proving across 1000's of machines to achieve the timeliness requirements. 
To achieve this, we would ideally like to not introduce any further technological dependencies. 16 | 17 | 18 | ## Overall Architecture 19 | 20 | The overall architecture of the proving subsystem is given in the following diagram. The orchestrator understands the process of proving a full block and distills the proving process into a stream of individual proof requests. These requests can be thought of as 'jobs' pushed to a queueing abstraction. Then, once a proof has been produced a callback is invoked notifying the orchestrator that the proof is available. The dotted line represents this abstraction, an interface behind which we want to encourage the development of alternative methods of distributing these jobs. In this diagram the 'Prover' is an entity responsible for taking the requests and further distributing them to a set of individual proving agents. 21 | 22 | In this architecture it is important to understand the separation of concerns around state. The orchestrator must maintain a persisted store describing the overall proving state of the block such that if it needs to restart it can continue where it left off and doesn't need to restart the block from scratch. This state however will not include the in-progress state of every outstanding proving job. This is the responsibility of the state managed by the prover. This necessitates that once the `Proof Request` has been accepted by the prover, the information backing that request has been persisted. Likewise, the `Proof Result` acknowledgement must be persisted before completion of the callback. This ensures no proving jobs can be lost in the cracks. It is always possible that a crash can occur for example after the `Proof Result` is accepted and before it has been removed from the prover's store. For this reason, duplicate requests in either direction must be idempotent. 
23 | 24 | ![Proving Architecture](./proving-arch.png) 25 | 26 | ### A Simple but Sufficient Prover Implementation 27 | 28 | Frankly, there are many ways in which the `Prover` component can be built. It is a common pattern within Web 2 and many technologies exist to help. Ideally however we would like to reduce our dependencies as much as possible, so a solution relying solely on existing dependencies would be desirable. First, let us look at our high level requirements: 29 | 30 | 1. Proving a single block of 11520 transactions (10 TPS for 32 blocks where each block is 3 ethereum blocks) 31 | 2. 3 public functions per transaction results in the order of 100,000 proof requests. 32 | 3. Depending on timeliness requirements and proof computational complexity, we should expect the number of prover agents to be up to 10,000. 33 | 34 | ### Other Possibilities Considered 35 | 36 | We considered using RabbitMQ. This is a well supported persisted message queue implementation. However, like most queueing implementations, its priorities are related to resilience, scalability and capacity. It can distribute a large number of jobs across a large number of clients very quickly. It also offers redundant setups etc. Also like most queueing implementations, it lacks flexibility. For example, it's possible that proving agents won't be homogeneous. A prover may have access to a variety of machines with differing hardware capabilities meaning greater flexibility is needed when it comes to retrieving jobs and distributing them. This type of flexibility isn't generally supported by off the shelf queueing implementations. 37 | 38 | ### LMDB Backed Proof Request Store 39 | 40 | We propose creating a proving job broker as a Node.js server using an LMDB database as its persistent store. Crucially, we will make a number of compromises to achieve this. 
41 | 42 | #### Broker Architecture and Implementation 43 | 44 | The broker exists as an HTTP service using an LMDB DB to store an index of proving jobs. The index is a mapping from the job id to the job's metadata. The job metadata can be summarised by the following: 45 | 46 | ``` TS 47 | type Metadata = { 48 | proofType: PROOF_TYPE; //(BASE_ROLLUP etc...) 49 | epochNumber: number; 50 | status: // (IN_PROGRESS or COMPLETED) 51 | } 52 | 53 | ``` 54 | 55 | Proving request data, such as input witness and recursive proofs, are stored in a directory labelled with the job's id residing on an NFS share/S3 bucket or similar. This is an optimisation, as prover agents can access the data independently, without requiring the broker to transfer large amounts of data to them. If it turns out that this is not required then the proof requests will reside on a disk local to the broker and will be transferred over the network from the broker. Maybe this should be a configurable aspect of the system. 56 | 57 | ![alt text](./broker.png) 58 | 59 | The entire proof request index DB is loaded into memory upon service start. 60 | 61 | Prover agents periodically poll the broker to either request more work or to update the broker that they are still working on a specific job. As there are a large number of clients we will need to limit the frequency of that polling to perhaps every 10 seconds. 62 | 63 | This gives an overall job flow that looks as follows: 64 | 65 | 1. The broker receives a `Proof Request` for job id #4567 from the orchestrator. 66 | 2. The broker persists the proof's input data to disk and then inserts an entry into the index DB. Finally it creates an entry in the in-memory cache of jobs. 67 | 3. An agent polls for new work. A query is performed on the in-memory cache based on the capabilities of the prover and ordered to prioritise earlier epochs, discounting all jobs that are already being proven. 68 | 4. The query returns the job #4567. 69 | 5. 
The broker stores the prover agent id and the current time against the job as `Start Time`. 70 | 6. The broker returns the job details to the agent. 71 | 72 | Later, the agent polls again: 73 | 74 | 1. The agent polls the broker with the job id and job start time 75 | 2. The broker finds the job in the in-memory cache 76 | 3. If the job is being worked on by another agent then the start time against the job is compared to the start time in the agent's request. The earlier time wins and the cache is updated if necessary with the winner's details. If the polling agent loses then the agent is told to stop working on the job and is given a new job. 77 | 4. The broker adds the current time to the job as `Last Update Time` 78 | 79 | Later, the agent polls to say the job is completed: 80 | 81 | 1. The agent has completed the job and immediately polls with the result. 82 | 2. The results are stored and the job is updated as completed, probably moved to a different in memory cache. 83 | 84 | The reason for the start time negotiation around agents is that we need to consider a broker crash/restart. 85 | 86 | Upon restart, the broker should: 87 | 88 | 1. Re-build the in-memory cache from the index. 89 | 2. Start accepting new job requests from agents. 90 | 91 | The above means that a job that is already being worked could be given to another agent after a restart. All of the data around current proving jobs is only in memory so is lost upon process termination. By negotiating based on start time, soon after restart the system will revert to how it was without too much duplicated effort. Of course, for small proofs, it is still possible that a proof could be completed twice. This seems an acceptable trade-off. 92 | 93 | Finally, the broker periodically checks all current jobs to see if their `Last Update Time` is beyond a certain window. If so, the details of the agent proving the job are removed, so they become available again. 
94 | 95 | The described interactions should mean that we maintain a queue of jobs, prioritised in whatever way we need, queryable however we require whilst only using a simple LMDB store and directory structure. By doing all of this in memory we drastically reduce the amount of DB access required at the expense of potentially some duplicated effort and negotiation upon broker restart (something we hope is a rare occurrence). Even if we consider a worst case scenario of ~200,000 outstanding proof requests, this should not require more than a few tens of MB of memory to cache. One potential concern is performance. There will be a large number of prover agents querying for work and these queries will need to be very efficient, but this will be the case with any system. 96 | 97 | The last step is that the broker pushes all completed jobs back to the orchestrator, shortly after they have been completed but asynchronously to the completion message from the agent. The job is removed from both the directory listing and the index DB. When the queue is empty, a check is performed that the proof request directory is empty. Any remaining data is deleted. 
-------------------------------------------------------------------------------- /in-progress/proving-queue/broker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/proving-queue/broker.png -------------------------------------------------------------------------------- /in-progress/proving-queue/proving-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/proving-queue/proving-arch.png -------------------------------------------------------------------------------- /in-progress/world-state/append-only-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/world-state/append-only-tree.png -------------------------------------------------------------------------------- /in-progress/world-state/current-tree-state-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/world-state/current-tree-state-structure.png -------------------------------------------------------------------------------- /in-progress/world-state/first-prune.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/world-state/first-prune.png -------------------------------------------------------------------------------- /in-progress/world-state/historic-hash-path.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/world-state/historic-hash-path.png -------------------------------------------------------------------------------- /in-progress/world-state/image-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/world-state/image-structure.png -------------------------------------------------------------------------------- /in-progress/world-state/reference-counting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/world-state/reference-counting.png -------------------------------------------------------------------------------- /in-progress/world-state/second-prune.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/world-state/second-prune.png -------------------------------------------------------------------------------- /in-progress/world-state/snapshot-tree-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AztecProtocol/engineering-designs/3e4d81b5785732ba38ca639392a4c6d49e765924/in-progress/world-state/snapshot-tree-structure.png -------------------------------------------------------------------------------- /prd.template.md: -------------------------------------------------------------------------------- 1 | # [Project Name] Project Requirements Document 2 | 3 | - Owner: 4 | - Approvers: 5 | - @[someone on product] 6 | - 
@[engineering team lead] 7 | - @[devrel engineer] 8 | - @[does this need the legal team?] 9 | - @[does this need the finance team?] 10 | - @[does this need the sales team?] 11 | - Target PRD Approval Date: YYYY-MM-DD 12 | - Target Project Delivery Date: YYYY-MM-DD 13 | 14 | ## Key words 15 | 16 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC 2119](https://datatracker.ietf.org/doc/html/rfc2119). 17 | 18 | ## Background 19 | 20 | Provide a high-level background for the project. Add links to any relevant documents rather than duplicating information (note, if linking to github, make sure to include the commit hash). 21 | 22 | Explain what the status quo is and why it is inadequate. 23 | 24 | ## Key assumptions and dependencies 25 | 26 | List the key assumptions made in creating the requirements. 27 | 28 | List the key dependencies on other work streams or requirements. 29 | 30 | For example, we assume that feature X will be complete by Y date, and that it will be cheaper to do Z than V. 31 | 32 | ## Desired User Flow(s) 33 | 34 | Provide a high-level overview of the desired user flow. 35 | 36 | There may be multiple user flows, and each flow should be in its own section. 37 | 38 | Place the most important user flows first. 39 | 40 | For example, if the project is about ensuring that upgrading to a new version of the rollup is "easy", focus on **who** you most want it to be easy for, and **what experience** you want them to have. 41 | 42 | Each flow should discuss why you believe the user demands (or will tolerate) this experience. 43 | 44 | ### (Example) Easy Upgrades for End Users 45 | 46 | When a new canonical version of the rollup is released, end users (e.g., wallet users) expect... 47 | 48 | We know this because... 
49 | 50 | ### (Example) Easy Upgrades for Validators 51 | 52 | When a new canonical version of the rollup is released, validator operators expect... 53 | 54 | We know this because... 55 | 56 | ## Requirements 57 | 58 | Describe all the requirements you know of for the project which are relevant given the delivery deadline. 59 | 60 | For example, if we were speccing an upgrade mechanism for a "regular database", we might have the following requirements: 61 | 62 | ### Functional Requirements (what the system does) 63 | 64 | #### (Example) Data Migration Utility 65 | 66 | - What: The system MUST include a utility that automatically migrates legacy schemas and data to the new database format. 67 | - Why: To ensure a seamless transition with minimal manual intervention, preventing data loss or inconsistencies during the upgrade. 68 | - Where: Derived from stakeholder interviews and an analysis of the current system’s limitations. 69 | 70 | #### (Example) Backup and Rollback Mechanism 71 | 72 | - What: The system MUST create a complete backup of the existing database and provide a rollback option in case of upgrade failures. 73 | - Why: To safeguard against potential data corruption or upgrade errors, ensuring business continuity. 74 | - Where: Based on IT risk assessments and business continuity planning guidelines. 75 | 76 | ### Non-Functional Requirements (qualities the system has) 77 | 78 | #### (Example) Security Compliance 79 | 80 | - What: The upgrade process MUST enforce encryption and strict access control measures to protect data in transit and at rest. 81 | - Why: To protect sensitive information against unauthorized access and to comply with data protection regulations. 82 | - Where: Sourced from regulatory requirements (e.g., GDPR, HIPAA) and security audits. 83 | 84 | #### (Example) Maintainability 85 | 86 | - What: Upgrade scripts and procedures SHOULD be modular and well-documented to facilitate future maintenance and troubleshooting. 
87 | - Why: To reduce long-term maintenance costs and ensure the system can be easily updated or modified. 88 | - Where: Informed by best practices in IT service management and lessons learned from previous upgrade projects. 89 | 90 | ### Performance Requirements 91 | 92 | #### (Example) Upgrade Speed 93 | 94 | - What: The migration process MUST complete within a 4-hour maintenance window. 95 | - Why: To minimize system downtime and avoid disruption to business operations during the upgrade. 96 | - Where: Based on service level agreements (SLAs) and operational constraints set by the IT department. 97 | 98 | #### (Example) Post-Upgrade Query Performance 99 | 100 | - What: Average query response times MUST NOT exceed 200 milliseconds under normal load conditions. 101 | - Why: To ensure that system performance meets user expectations and maintains efficient operations. 102 | - Where: Derived from performance benchmarks established during testing and requirements from the performance engineering team. 103 | - NOTE: Within the next 12 months, we will want this to be 100 milliseconds. 104 | 105 | ## Tradeoff Analysis 106 | 107 | Include commentary on where the ideal candidate solution should sit in a tradeoff space, to give guidance how different candidate solutions will be compared. For example, stating that between two candidate solutions, the one that has the lower "cost measured in X" will be preferred. 108 | 109 | ## Disclaimer 110 | 111 | The information set out herein is for discussion purposes only and does not represent any binding indication or commitment by Aztec Labs and its employees to take any action whatsoever, including relating to the structure and/or any potential operation of the Aztec protocol or the protocol roadmap. 
In particular: (i) nothing in these projects, requests, or comments is intended to create any contractual or other form of legal relationship with Aztec Labs or third parties who engage with this AztecProtocol GitHub account (including, without limitation, by responding to a conversation or submitting comments) (ii) by engaging with any conversation or request, the relevant persons are consenting to Aztec Labs’ use and publication of such engagement and related information on an open-source basis (and agree that Aztec Labs will not treat such engagement and related information as confidential), and (iii) Aztec Labs is not under any duty to consider any or all engagements, and that consideration of such engagements and any decision to award grants or other rewards for any such engagement is entirely at Aztec Labs’ sole discretion. Please do not rely on any information on this account for any purpose - the development, release, and timing of any products, features, or functionality remains subject to change and is currently entirely hypothetical. Nothing on this account should be treated as an offer to sell any security or any other asset by Aztec Labs or its affiliates, and you should not rely on any content or comments for advice of any kind, including legal, investment, financial, tax, or other professional advice. 
#!/bin/bash
#
# scripts/redo-typo-pr
#
# Re-creates a typo-fix pull request under the bot's authorship and closes the
# original one.
#
# Usage: redo-typo-pr <pr-number>
#
# Steps performed:
#   1. Check out the original PR into a local scratch branch.
#   2. Squash-merge that branch onto main (staged, not yet committed).
#   3. Commit the squashed change on a new branch.
#   4. Push the new branch to origin.
#   5. Open a new PR that credits the original author.
#   6. Close the original PR.
#
# Requires `git` and an authenticated `gh` CLI, run from a clone of the repo.

# -e: abort on the first failing command, -u: treat unset variables as errors,
# -x: trace every command so CI logs show exactly what ran.
set -eux

# Configuration
# Fail with a usage message (instead of a bare "unbound variable") when the
# PR number is omitted.
ORIGINAL_PR_NUMBER="${1:?usage: $0 <pr-number>}"

# The value is interpolated into a branch name and a URL below, so only accept
# a plain decimal PR number.
case "$ORIGINAL_PR_NUMBER" in
  *[!0-9]*)
    echo "error: PR number must be numeric, got '$ORIGINAL_PR_NUMBER'" >&2
    exit 1
    ;;
esac

REPO='AztecProtocol/engineering-designs'
PR_BRANCH="typo-pr-branch"
NEW_BRANCH="chore/typo-redo-$ORIGINAL_PR_NUMBER"
AUTHOR="$(gh pr view "$ORIGINAL_PR_NUMBER" --json author --jq '.author.login')"

# Step 1: Checkout the PR locally
# --force resets the scratch branch if it is left over from a previous run;
# without it, gh tries to fast-forward the stale branch and fails.
echo "Checking out PR #$ORIGINAL_PR_NUMBER"
gh pr checkout "$ORIGINAL_PR_NUMBER" --branch "$PR_BRANCH" --force

# Step 2: Create squash commit on main
# `--squash` only stages the combined change; nothing is committed on main.
echo "Squashing PR branch onto main"
git checkout main
git merge "$PR_BRANCH" --squash

# Step 3: Commit squash commit to new branch
# The staged changes carry over to the newly created branch.
echo "Creating new local branch $NEW_BRANCH"
git checkout -b "$NEW_BRANCH"
# NOTE(review): the author string has no "<email>" part (it may have been lost
# in transit). Without one, git looks up an existing commit whose author
# matches this pattern -- confirm this is the intended value.
git commit -a --author="AztecBot " -m "chore: redo typo PR"

# Step 4: Push the new branch to GitHub
echo "Pushing new branch $NEW_BRANCH to GitHub"
git push origin "$NEW_BRANCH"

# Step 5: create a new pull request
echo "Creating a new pull request for $NEW_BRANCH"
gh pr create \
  --base main \
  --head "$NEW_BRANCH" \
  --title "chore: redo typo PR by $AUTHOR" \
  --body "Thanks $AUTHOR for https://github.com/$REPO/pull/$ORIGINAL_PR_NUMBER. Our policy is to redo typo changes to dissuade metric farming. This is an automated script."

# Step 6: Close the original PR
echo "Closing original PR #$ORIGINAL_PR_NUMBER"
gh pr close "$ORIGINAL_PR_NUMBER"

echo "Script completed."
38 | --------------------------------------------------------------------------------