├── .github
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ └── feature_request.md
├── PULL_REQUEST_TEMPLATE.yml
├── dependabot.yml
├── labeler.yml
└── workflows
│ ├── code_quality_control.yml
│ ├── cos_integration.yml
│ ├── docs.yml
│ ├── docs_test.yml
│ ├── label.yml
│ ├── lints.yml
│ ├── pr_request_checks.yml
│ ├── pull-request-links.yml
│ ├── pylint.yml
│ ├── python-publish.yml
│ ├── quality.yml
│ ├── ruff.yml
│ ├── run_test.yml
│ ├── stale.yml
│ ├── test.yml
│ ├── testing.yml
│ ├── unit-test.yml
│ └── welcome.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── agorabanner.png
├── docs
├── .DS_Store
├── applications
│ ├── customer_support.md
│ ├── enterprise.md
│ └── marketing_agencies.md
├── architecture.md
├── assets
│ ├── css
│ │ └── extra.css
│ └── img
│ │ ├── SwarmsLogoIcon.png
│ │ ├── swarmsbanner.png
│ │ ├── tools
│ │ ├── output.png
│ │ ├── poetry_setup.png
│ │ └── toml.png
│ │ └── zetascale.png
├── bounties.md
├── contributing.md
├── demos.md
├── design.md
├── examples
│ ├── count-tokens.md
│ ├── index.md
│ ├── load-and-query-pinecone.md
│ ├── load-query-and-chat-marqo.md
│ ├── query-webpage.md
│ ├── store-conversation-memory-in-dynamodb.md
│ ├── talk-to-a-pdf.md
│ ├── talk-to-a-webpage.md
│ ├── talk-to-redshift.md
│ └── using-text-generation-web-ui.md
├── faq.md
├── flywheel.md
├── hiring.md
├── index.md
├── metric.md
├── overrides
│ └── main.html
├── purpose.md
├── research.md
├── roadmap.md
├── stylesheets
│ └── extra.css
└── zeta
│ ├── .DS_Store
│ ├── index.md
│ ├── nn
│ ├── architecture
│ │ ├── decoder.md
│ │ └── transformer.md
│ ├── attention
│ │ ├── base.md
│ │ ├── flash2.md
│ │ ├── flash_attention.md
│ │ ├── multihead.md
│ │ └── multiquery.md
│ ├── biases
│ │ ├── alibi.md
│ │ ├── relative_bias.md
│ │ └── xpos.md
│ ├── embeddings
│ │ ├── multiway.md
│ │ ├── rope.md
│ │ └── truncated_rope.md
│ ├── modules
│ │ ├── lora.md
│ │ └── token_learner.md
│ └── utils
│ │ └── helpers.md
│ ├── tokenizers
│ ├── language_tokenizer.md
│ ├── multi_modal_tokenizer.md
│ └── sentencepiece.md
│ └── training
│ ├── nebula.md
│ ├── optimizers
│ ├── decoupled_lion.md
│ └── sophia.md
│ └── train.md
├── example.py
├── mkdocs.yml
├── package
├── __init__.py
├── main.py
└── subfolder
│ ├── __init__.py
│ └── main.py
├── pyproject.toml
├── requirements.txt
└── scripts
├── code_quality.sh
├── merge_all_prs.sh
├── test_name.sh
└── tests.sh
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [kyegomez]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
13 | custom: #Nothing
14 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a detailed report on the bug and its root cause. Conduct root cause error analysis
4 | title: "[BUG] "
5 | labels: bug
6 | assignees: kyegomez
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is and what its root cause appears to be. Test thoroughly before submitting.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Additional context**
27 | Add any other context about the problem here.
28 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: 'kyegomez'
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.yml:
--------------------------------------------------------------------------------
1 |
10 |
11 | Zeta provides you with reliable, high-performance, and fast modular building blocks for building zetascale neural nets at lightspeed with minimal code and a pythonic API.
12 |
13 | [Click here for Zeta Documentation →](zeta/)
14 |
15 |
16 | ## Examples
17 |
18 | Check out Zeta examples for building agents, data retrieval, and more.
19 |
20 | [Check out Zeta examples →](examples/)
21 |
--------------------------------------------------------------------------------
/docs/metric.md:
--------------------------------------------------------------------------------
1 | # The Golden Metric:
2 |
3 | * We need to figure out a single metric that determines whether we're accomplishing our goal with Zeta, which is to build zetascale superintelligent AI models as fast as possible with minimal code.
4 |
5 |
--------------------------------------------------------------------------------
/docs/overrides/main.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 |
4 |
5 | {% block announce %}
6 |
9 | {% endblock %}
--------------------------------------------------------------------------------
/docs/purpose.md:
--------------------------------------------------------------------------------
1 | # Zeta's Purpose
2 |
3 |
4 | Every once in a while, a revolutionary project comes along that changes everything.
5 |
6 | We live in a landscape cluttered with rigid frameworks, plagued by inefficiencies, where developers - our brightest minds - are bogged down by limitations.
7 |
8 | Now, imagine a world where harnessing the power of state-of-the-art models isn't just possible... it's simple. A world where efficiency doesn’t sacrifice safety, and where your ideas are bounded only by your imagination. We should be living in this world. But we aren't.
9 |
10 |
11 | Because Zeta is what's missing.
12 |
13 |
14 | The challenge? Creating a framework that's not just another tool, but a revolution.
15 |
16 | To bridge this gap, one would need to optimize at the foundational level, prioritize user experience, and introduce a design philosophy that future-proofs. It's colossal. And until now, no one's even come close.
17 |
18 |
19 | But there’s an enormous opportunity here. An opportunity that promises not just recognition but the power to redefine an industry. And, the key to unlocking this future? It's been with us all along.
20 |
21 |
22 | Insight.
23 |
24 |
25 | Introducing... Zeta.
26 |
27 |
28 | Our secret? Fluidity.
29 |
30 | It’s a philosophy that values modularity, reliability, usability, and unmatched speed.
31 |
32 | But more than that, it's a commitment to evolution, to pushing boundaries, to never settling.
33 |
34 |
35 | Why are we the best to execute this vision?
36 |
37 | Because we've been there from the start.
38 |
39 | We've seen the challenges, felt the frustrations, and now, we're poised to lead the revolution.
40 |
41 | We’ve done it before, and with Zeta, we’re doing it again.
42 |
43 |
44 | Zeta isn’t just the next step. It's a leap into the future.
45 |
46 | Zeta is the future of AI.
47 |
48 |
--------------------------------------------------------------------------------
/docs/roadmap.md:
--------------------------------------------------------------------------------
1 |
2 | **[Zeta's 3-Step Master Plan for Perfecting Multi-Modality LLMs]**
3 |
4 | ---
5 |
6 | **1. Refinement and Excellence: Perfecting the Framework**
7 | - **[Objective]**: To develop Zeta into the most sophisticated, yet intuitively simple framework for building Multi-Modality LLMs.
8 |
9 | - **[Strategies]**
10 | - **Zeta Innovation Labs**:
11 | * Create a dedicated team of experts who exclusively focus on refining the foundational modules and blocks.
12 | * Prioritize research in areas like advanced self-supervised learning, multi-modal integration, and zero-shot learning.
13 | - **Modularity Focus**:
14 | * Develop plug-and-play modules that allow developers to effortlessly incorporate various data types (text, image, video, audio) into their LLMs.
15 | * Standardize the blocks ensuring consistent performance, error-handling, and interoperability.
16 | - **Performance Optimization**:
17 | * Collaborate with hardware manufacturers to ensure that Zeta is perfectly optimized for cutting-edge GPUs, TPUs, and other specialized hardware.
18 | * Roll out regular updates to keep the framework at the forefront of performance.
19 |
20 | ---
21 |
22 | **2. User-Centric Development: Making Zeta Intuitive**
23 | - **[Objective]**: Ensure that every feature, tool, and module in Zeta aligns with the principle of making LLM creation simpler and more efficient.
24 |
25 | - **[Strategies]**
26 | - **Zeta Academy**:
27 | * Host frequent workshops and webinars targeted at educating users on harnessing the power of Zeta's multi-modality LLM features.
28 | * Create a vast library of tutorials, ranging from beginner to advanced, with real-world examples of LLM implementation.
29 | - **Interactive GUI for LLM Design**:
30 | * Develop a visual interface where users can drag-and-drop modules, visualize their LLM architecture, and see real-time performance metrics.
31 | - **Feedback Loops**:
32 | * Create a robust system to collect and implement feedback. Users should feel like they’re co-creating Zeta.
33 | * Launch a beta program where selected developers can test new features and provide insights.
34 |
35 | ---
36 |
37 | **3. Scaling and Outreach: From the Labs to the World**
38 | - **[Objective]**: Make Zeta the de facto choice for developers worldwide aiming to craft state-of-the-art Multi-Modality LLMs.
39 |
40 | - **[Strategies]**
41 | - **Zeta Ambassadors**:
42 | * Identify and collaborate with top AI researchers and practitioners globally, making them the face and voice of Zeta in their communities.
43 | - **Strategic Partnerships**:
44 | * Work closely with major tech institutions, universities, and platforms to integrate Zeta into their curriculum or platforms.
45 | * Create an API gateway for seamless integration of Zeta with other popular machine learning and data processing platforms.
46 | - **Global Challenges & Competitions**:
47 | * Organize worldwide LLM challenges, where developers use Zeta to solve real-world problems, bringing attention to both the problems and the capabilities of Zeta.
48 |
49 | ---
50 |
51 |
52 | In every tool, in every line of code, in every module of Zeta, you'll find our relentless pursuit of excellence. But remember, at its core,
53 |
54 | Zeta isn't about us,
55 |
56 | it's about you, the creator.
57 |
58 | It's about giving you the power, the simplicity, and the edge to redefine the boundaries of what's possible.
59 |
60 | With Zeta, we’re not just building a tool; we're crafting the future.
61 |
62 | A future we're eager to see through your eyes.
63 |
64 |
65 |
66 |
67 | ------
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 | **[Zeta's 3-Step Master Plan]**
92 |
93 | **1. Cultivate an Ecosystem of Innovation**
94 | - **[Objective]**: Establish an environment where creativity and innovation are paramount.
95 |
96 | - **[Strategies]**
97 | - **Education & Outreach**:
98 | * Launch a series of free online courses, workshops, and webinars to educate developers on the capabilities and advantages of Zeta.
99 | * Partner with top universities and institutions, offering them early access and integrations, fostering a new generation of developers natively trained on Zeta.
100 | - **Zeta Labs**:
101 | * Open a research lab committed to pushing the boundaries of what neural networks can achieve.
102 | * Provide grants, resources, and mentorship to promising projects and startups that choose to build with Zeta.
103 | - **Open Source Philosophy**:
104 | * Release parts of Zeta's core codebase to the public, inviting developers worldwide to contribute, refine, and expand upon the framework.
105 | * Organize hackathons and coding challenges to galvanize the community around real-world problems that Zeta can solve.
106 |
107 | ---
108 |
109 | **2. Seamless Integration & Scalability**
110 | - **[Objective]**: Make Zeta the easiest, most efficient, and most scalable framework to integrate into any project or system.
111 |
112 | - **[Strategies]**
113 | - **Developer Toolkits**:
114 | * Release a suite of tools, plugins, and libraries for all major development platforms and languages, ensuring Zeta is accessible to everyone, everywhere.
115 | - **Zeta Cloud**:
116 | * Offer a cloud solution that allows developers to run, test, and deploy their neural networks seamlessly. This ensures businesses of all sizes can scale without friction.
117 | - **Partnerships**:
118 | * Collaborate with major tech companies, ensuring Zeta's native support on platforms like AWS, Google Cloud, and Azure.
119 | * Establish alliances with hardware manufacturers, optimizing Zeta for the latest GPUs and Neural Network Processors.
120 |
121 | ---
122 |
123 | **3. Build a Community and Cultivate Trust**
124 | - **[Objective]**: Establish Zeta as more than a tool – it should be a movement, a community of forward-thinkers who believe in redefining the boundaries of neural network capabilities.
125 |
126 | - **[Strategies]**
127 | - **ZetaCon**:
128 | * Annually host a global conference (both offline and online) bringing together the brightest minds in the AI and machine learning sector. It will be a platform for networking, knowledge-sharing, and showcasing the best of what's been built using Zeta.
129 | - **Transparency Reports**:
130 | * Release regular updates about Zeta's development, challenges, successes, and roadmap.
131 | * Actively gather feedback, ensuring the community feels heard and that their insights are valued.
132 | - **Zeta Academy**:
133 | * Create a platform where developers can share their projects, tutorials, and courses about Zeta. Recognize and reward the best contributions to foster a sense of ownership and pride within the community.
134 |
135 | ---
136 |
137 | This isn't just a roadmap. It's our promise, our commitment. Because at the end of the day, it's not about the lines of code we write. It's about the lives we change, the innovations we inspire, and the future we create. And with Zeta, we believe that future is brighter than ever. Let's build it together.
138 |
139 |
140 |
--------------------------------------------------------------------------------
/docs/stylesheets/extra.css:
--------------------------------------------------------------------------------
1 | :root {
2 | --md-primary-fg-color: #8315F9;
3 | --md-accent-fg-color: #00FFCE;
4 | }
--------------------------------------------------------------------------------
/docs/zeta/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Swarm-Corporation/Multi-Agent-Template-App/dbb3ebd78a39b698068b2d4eae4365450fa05dbe/docs/zeta/.DS_Store
--------------------------------------------------------------------------------
/docs/zeta/index.md:
--------------------------------------------------------------------------------
1 | The Zeta framework provides developers with the ability to create state-of-the-art models as simply and seamlessly as possible through **Modularity**, **Reliability**, **Use-Ability**, and **Speed**.
2 |
3 | Zeta not only helps developers harness the potential of LLMs and Multi-Modal Foundation Models but also enforces trust boundaries, schema validation, and tool activity-level permissions. By doing so, Zeta maximizes LLMs’ reasoning while adhering to strict policies regarding their capabilities.
4 |
5 | Zeta’s design philosophy is based on the following tenets:
6 |
7 | 1. **Use-Ability**: Utilizing Zeta should feel like going for a swim in the ocean - seamless and fluid, with pythonic methods and classes and error handling that indicates what steps to take next.
8 | 2. **Reliability**: Zeta puts every FLOP to work by harnessing ultra-reliable and high-performance designs for all functions and classes.
9 | 3. **Speed**: Zeta is like the Lamborghini of ML frameworks, with simply unparalleled speed.
10 |
11 | ## Quick Starts
12 |
13 | ### Using pip
14 |
15 | Install **zeta**
16 |
17 | ```
18 | pip3 install zeta
19 | ```
20 |
21 | ## Unleash FlashAttention
22 | With Zeta, you can unleash the best and highest-performance attention mechanisms, like `FlashAttention` and `MultiQueryAttention`. Here's an example with Flash Attention:
23 |
24 | ```python
25 | import torch
26 | from zeta import FlashAttention
27 |
28 | q = torch.randn(2, 4, 6, 8)
29 | k = torch.randn(2, 4, 10, 8)
30 | v = torch.randn(2, 4, 10, 8)
31 |
32 | attention = FlashAttention(causal=False, dropout=0.1, flash=False)
33 | output = attention(q, k, v)
34 |
35 | print(output.shape)
36 | ```
37 |
38 | ## Unleash GPT-4
39 | On top of the SOTA attention mechanisms we provide, we also offer rough implementations of some of the best neural nets ever made, like `GPT4`. Here's an example of how to use our implementation of GPT-4:
40 |
41 | ```python
42 | import torch
43 | from zeta import GPT4, GPT4MultiModal
44 |
45 | #text
46 | text = torch.randint(0, 256, (1, 1024))  # token indices; keep inputs and model on the same device
47 | img = torch.randn(1, 3, 256, 256)
48 |
49 | gpt4_language = GPT4()
50 |
51 | gpt4_language(text)
52 |
53 | #multimodal GPT4
54 |
55 | gpt4_multimodal = GPT4MultiModal()
56 | gpt4_multimodal_output = gpt4_multimodal(text, img)
57 |
58 | ```
59 |
60 |
--------------------------------------------------------------------------------
/docs/zeta/nn/architecture/decoder.md:
--------------------------------------------------------------------------------
1 | # Decoder Class Documentation
2 |
3 | Module/Class Name: Decoder
4 |
5 | ```python
6 | class Decoder(AttentionLayers):
7 | def __init__(self, **kwargs):
8 | assert 'causal' not in kwargs, 'cannot set causality on decoder'
9 | super().__init__(causal=True, **kwargs)
10 | ```
11 |
12 | ## Overview and Introduction
13 |
14 | The `Decoder` class is a component of the Zeta library designed for creating a decoder model with multiple attention layers. It extends the functionality of the `AttentionLayers` class to enable the construction of a decoder architecture. The decoder is a key component in various sequence-to-sequence tasks, such as machine translation, text generation, and more.
15 |
16 | The decoder employs multi-head self-attention mechanisms and feed-forward networks to transform input sequences into meaningful output sequences while maintaining the causal property. It is particularly suitable for autoregressive tasks, where each step depends only on previous steps in the sequence.
17 |
18 | ## Class Definition
19 |
20 | ```python
21 | class Decoder(AttentionLayers):
22 | def __init__(self, **kwargs):
23 | assert 'causal' not in kwargs, 'cannot set causality on decoder'
24 | super().__init__(causal=True, **kwargs)
25 | ```
26 |
27 | The `Decoder` class inherits from the `AttentionLayers` class and introduces the causality constraint by setting `causal=True`. It is initialized with various parameters that configure the architecture and behavior of the decoder.
28 |
29 | ## Parameters
30 |
31 | The `Decoder` class constructor accepts various parameters that control the behavior of the decoder. The most important parameters are inherited from the `AttentionLayers` class, and additional parameters specific to the decoder are introduced. Below is a summary of the parameters:
32 |
33 | - `dim` (int): Dimensionality of the model.
34 | - `depth` (int): Number of decoder layers.
35 | - `heads` (int): Number of parallel attention heads.
36 | - `cross_attend` (bool): Enable cross-attention between input and output sequences.
37 | - `sandwich_coef` (int): Coefficient for configuring sandwich normalization.
38 | - `residual_attn` (bool): Enable residual connection for self-attention layers.
39 | - `cross_residual_attn` (bool): Enable residual connection for cross-attention layers.
40 | - `layer_dropout` (float): Dropout probability applied to each layer.
41 | - ... (additional parameters inherited from `AttentionLayers`)
42 |
43 | ## Functionality and Usage
44 |
45 | The `Decoder` class extends the functionality of the `AttentionLayers` class to specifically create decoder models. It employs multi-head self-attention mechanisms and feed-forward networks to process input sequences and generate output sequences.
46 |
47 | ### Initialization
48 |
49 | To create a decoder instance, you can use the following code:
50 |
51 | ```python
52 | from zeta import Decoder
53 |
54 | decoder = Decoder(
55 | dim=512,
56 | depth=6,
57 | heads=8,
58 |     # note: causality is set automatically; passing causal=True here would trigger the assertion above
59 | cross_attend=True,
60 | residual_attn=True,
61 | layer_dropout=0.1
62 | )
63 | ```
64 |
65 | ### Forward Pass
66 |
67 | The forward pass of the decoder can be performed using the following code:
68 |
69 | ```python
70 | output = decoder(input_sequence, context=context_sequence, mask=mask_sequence, context_mask=context_mask_sequence)
71 | ```
72 |
73 | Here, `input_sequence` represents the input sequence to the decoder, `context_sequence` represents the context sequence for cross-attention (if enabled), `mask_sequence` is an optional mask to ignore certain elements in the input, and `context_mask_sequence` is an optional mask for the context sequence.
74 |
75 | ### Return Intermediates
76 |
77 | If desired, you can also obtain intermediate outputs at each layer using the `return_hiddens` parameter:
78 |
79 | ```python
80 | output, intermediates = decoder(input_sequence, context=context_sequence, mask=mask_sequence, context_mask=context_mask_sequence, return_hiddens=True)
81 | ```
82 |
83 | The `intermediates` object will contain information about intermediate hidden states and attention outputs for each layer.
84 |
85 | ## Mathematical Formula
86 |
87 | The `Decoder` class is built upon the foundation of multi-head self-attention and feed-forward networks. It can be summarized using the following mathematical formula:
88 |
89 | 1. Input Embedding: \( X \)
90 | 2. Multi-Head Self-Attention: \( A = \text{MultiHeadAttention}(X) \)
91 | 3. Feed-Forward Network: \( Y = \text{FeedForward}(A) \)
92 | 4. Residual Connection: \( Z = X + Y \)
93 |
94 | The above formula represents the basic forward pass of each layer in the decoder. The decoder iteratively applies these operations across its layers to generate meaningful output sequences while maintaining causal dependencies.
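
As a rough illustration of these per-layer operations, here is a minimal sketch in plain PyTorch. It is not the actual Zeta `AttentionLayers` implementation (which adds normalization, cross-attention, dropout, and many other options); it only mirrors the formula above.

```python
import torch
import torch.nn as nn

class ToyDecoderLayer(nn.Module):
    """One decoder layer: causal self-attention -> feed-forward -> residual."""
    def __init__(self, dim=512, heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.ff = nn.Sequential(nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim))

    def forward(self, x):
        seq_len = x.size(1)
        # causal mask: position i may only attend to positions <= i
        causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
        a, _ = self.attn(x, x, x, attn_mask=causal_mask)  # A = MultiHeadAttention(X)
        y = self.ff(a)                                    # Y = FeedForward(A)
        return x + y                                      # Z = X + Y

x = torch.randn(2, 16, 512)        # (batch, seq_len, dim)
print(ToyDecoderLayer()(x).shape)  # torch.Size([2, 16, 512])
```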
95 |
96 | ## References
97 |
98 | - [Zeta Library Documentation](https://example.com/zeta/docs)
99 | - [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
100 | - [PAR: Prompted Attention](https://arxiv.org/abs/2207.04503)
101 |
102 |
103 | This documentation provides an in-depth overview of the `Decoder` class in the Zeta library. It covers its purpose, parameters, usage examples, and includes a simplified mathematical formula to illustrate its functionality.
--------------------------------------------------------------------------------
/docs/zeta/nn/architecture/transformer.md:
--------------------------------------------------------------------------------
1 | # Transformer Documentation
2 |
3 | ## Overview
4 |
5 | The `Transformer` class in the Zeta library is a versatile deep learning architecture that combines attention mechanisms with feedforward neural networks for various natural language processing tasks, such as language modeling, machine translation, and text generation. The Transformer architecture was introduced in the paper "Attention is All You Need" by Vaswani et al.
6 |
7 | The main purpose of the `Transformer` class is to provide a flexible and configurable interface for creating transformer-based models for sequence-to-sequence tasks. The class allows users to specify the number of tokens, maximum sequence length, attention layers, embeddings, and other parameters necessary for creating and training transformer models.
8 |
9 | The Transformer class supports both autoregressive and non-autoregressive training settings and includes features such as relative positional biases, rotary positional embeddings, memory tokens, and more.
10 |
11 | ## Class Signature
12 |
13 | ```python
14 | class Transformer(nn.Module):
15 | def __init__(
16 | self,
17 | *,
18 | num_tokens,
19 | max_seq_len,
20 | attn_layers,
21 | embedding_provider: BaseEmbedding,
22 | emb_dim = None,
23 | max_mem_len = 0.,
24 | shift_mem_down = 0,
25 | emb_dropout = 0.,
26 | post_emb_norm = False,
27 | num_memory_tokens = None,
28 | tie_embedding = False,
29 | logits_dim = None,
30 | use_abs_pos_emb = True,
31 | scaled_sinu_pos_emb = False,
32 | l2norm_embed = False,
33 | emb_frac_gradient = 1.
34 | )
35 | ```
36 |
37 | ## Parameters
38 |
39 | - `num_tokens` (int): The total number of tokens in the vocabulary.
40 | - `max_seq_len` (int): The maximum length of the input sequences.
41 | - `attn_layers` (AttentionLayers): An instance of the `AttentionLayers` class representing the core attention layers of the transformer.
42 | - `embedding_provider` (BaseEmbedding): An instance of the `BaseEmbedding` class providing token embeddings.
43 | - `emb_dim` (int, optional): The embedding dimension. Default is `None`, in which case `emb_dim` is set to the same dimension as the `attn_layers`.
44 | - `max_mem_len` (float, optional): Maximum memory length for memory tokens. Default is `0.0`, indicating no memory tokens.
45 | - `shift_mem_down` (int, optional): Number of positions to shift memory tokens down in each layer. Default is `0`.
46 | - `emb_dropout` (float, optional): Dropout rate applied to the embedding layer. Default is `0.0`.
47 | - `post_emb_norm` (bool, optional): Apply layer normalization to the post-embedding inputs. Default is `False`.
48 | - `num_memory_tokens` (int, optional): Number of memory tokens to use. Default is `None`, indicating no memory tokens.
49 | - `tie_embedding` (bool, optional): Tie the output projection weights with the input token embeddings. Default is `False`.
50 | - `logits_dim` (int, optional): Dimensionality of the output logits. Default is `None`, indicating that it's the same as `num_tokens`.
51 | - `use_abs_pos_emb` (bool, optional): Use absolute positional embeddings. Default is `True`.
52 | - `scaled_sinu_pos_emb` (bool, optional): Use scaled sinusoidal positional embeddings. Default is `False`.
53 | - `l2norm_embed` (bool, optional): Apply L2 normalization to the embeddings. Default is `False`.
54 | - `emb_frac_gradient` (float, optional): Fraction of the gradient that should go to the embedding. Default is `1.0`.
55 |
56 | ## Methods
57 |
58 | ### `forward`
59 |
60 | ```python
61 | def forward(
62 | self,
63 | x,
64 | return_embeddings = False,
65 | return_logits_and_embeddings = False,
66 | return_intermediates = False,
67 | mask = None,
68 | return_mems = False,
69 | return_attn = False,
70 | mems = None,
71 | pos = None,
72 | prepend_embeds = None,
73 | sum_embeds = None,
74 | **kwargs
75 | )
76 | ```
77 |
78 | This method computes the forward pass of the transformer.
79 |
80 | #### Parameters
81 |
82 | - `x` (torch.Tensor): Input tensor representing the sequence of token indices.
83 | - `return_embeddings` (bool, optional): If `True`, return only the embeddings without applying the output projection. Default is `False`.
84 | - `return_logits_and_embeddings` (bool, optional): If `True`, return both the logits and embeddings. Default is `False`.
85 | - `return_intermediates` (bool, optional): If `True`, return intermediate attention values. Default is `False`.
86 | - `mask` (torch.Tensor, optional): Attention mask indicating positions to be masked. Default is `None`.
87 | - `return_mems` (bool, optional): If `True`, return updated memory tokens. Default is `False`.
88 | - `return_attn` (bool, optional): If `True`, return attention maps. Default is `False`.
89 | - `mems` (list of torch.Tensor, optional): Memory tokens for each layer. Default is `None`.
90 | - `pos` (torch.Tensor, optional): External positional embeddings. Default is `None`.
91 | - `prepend_embeds` (torch.Tensor, optional): Prepend embeddings to the input sequence. Default is `None`.
92 | - `sum_embeds` (torch.Tensor, optional): Sum external embeddings to the input sequence. Default is `None`.
93 | - `kwargs`: Additional keyword arguments passed to the attention layers.
94 |
95 | #### Returns
96 |
97 | The method returns the output logits or embeddings based on the specified return options.
98 |
99 | ## Usage Examples
100 |
101 | Here are three usage examples of the `Transformer` class from the Zeta library:
102 |
103 | ```python
104 | from zeta.nn import Transformer
105 |
106 | # Example 1: Basic Usage
107 | transformer = Transformer(
108 | num_tokens=10000,
109 | max_seq_len=256,
110 | attn_layers=attn_layers_instance,
111 | embedding_provider=embedding_provider_instance
112 | )
113 | logits = transformer(input_tokens)
114 |
115 | # Example 2: Return Embeddings
116 | embeddings = transformer(input_tokens, return_embeddings=True)
117 |
118 | # Example 3: Return Intermediate Attention Maps
119 | logits, attn_maps = transformer(input_tokens, return_attn=True)
120 | ```
121 |
122 | In these examples, replace `attn_layers_instance` and `embedding_provider_instance` with actual instances of `AttentionLayers` and `BaseEmbedding`, respectively, and `input_tokens` with your input tensor containing token indices.
123 |
124 | ## Mathematical Formula
125 |
126 | The mathematical formula for the `Transformer` class can be represented as follows:
127 |
128 | ```
129 | Input -> Embedding -> Post-embedding Norm -> Embedding Dropout -> Project Embedding -> Attention Layers -> Layer Normalization -> To Logits/Embeddings
130 | ```
131 |
132 | In this formula, "Attention Layers" represents the core attention mechanism of the transformer, which includes self-attention and feedforward neural networks.
133 |
134 | ## References
135 |
136 | - Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is All You Need. Advances in neural information processing systems, 30.
137 | - Zeta Library: Link to the official documentation of the Zeta library.
138 |
139 |
140 |
141 |
--------------------------------------------------------------------------------
/docs/zeta/nn/attention/base.md:
--------------------------------------------------------------------------------
1 | # BaseAttention Abstract Class
2 | ============================
3 |
4 | The `BaseAttention` class is an abstract base class that defines the interface for all attention mechanisms. It includes the basic structure and methods that all attention mechanisms should have.
5 |
6 | ```python
7 | from abc import abstractmethod
8 | import torch.nn as nn
9 |
10 | class BaseAttention(nn.Module):
11 | @abstractmethod
12 | def __init__(self, dim):
13 | super().__init__()
14 | self.dim = dim
15 |
16 |
17 | @abstractmethod
18 | def forward(self, x, context=None, mask=None):
19 | pass
20 | ```
21 |
22 |
23 | ## Usage
24 | -----------------------
25 |
26 | The `FlashAttentionTwo` class extends the `BaseAttention` abstract base class and implements the specific attention mechanism.
27 |
28 | ```python
29 | class FlashAttentionTwo(BaseAttention):
30 | def __init__(
31 | self,
32 | *,
33 | dim,
34 | heads = 8,
35 | dim_head = 64,
36 | causal = False,
37 | q_bucket_size = 512,
38 | k_bucket_size = 1024,
39 | parallel = False,
40 | mixed_precision = False
41 | ):
42 |         super().__init__(dim)
43 | self.causal = causal
44 | self.parallel = parallel
45 | self.mixed_precision = mixed_precision
46 | self.q_bucket_size = q_bucket_size
47 | self.k_bucket_size = k_bucket_size
48 | # ... rest of the implementation ...
49 |
50 | def forward(
51 | self,
52 | x,
53 | context = None,
54 | mask = None,
55 | q_bucket_size = None,
56 | k_bucket_size = None,
57 | ):
58 | # ... implementation of the forward method ...
59 | ```
60 |
61 |
62 | ## Rules for Using the BaseAttention Class
63 | ---------------------------------------
64 |
65 | 1. Any class that extends the `BaseAttention` class must implement the `forward` method. This method defines how the attention mechanism operates.
66 |
67 | 2. The `__init__` method of the `BaseAttention` class takes a single parameter, `dim`. Any class that extends `BaseAttention` should pass this parameter to the `__init__` method of the base class.
68 |
69 | 3. The `forward` method of the `BaseAttention` class takes three parameters: `x`, `context`, and `mask`. Any class that extends `BaseAttention` should include these parameters in its `forward` method.
70 |
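As a quick illustration of these rules, here is a minimal toy subclass. This is only a sketch that assumes the `BaseAttention` class defined above; it is not part of the Zeta library.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleDotProductAttention(BaseAttention):
    """Toy single-head scaled dot-product attention following the BaseAttention interface."""
    def __init__(self, dim):
        super().__init__(dim)
        self.to_q = nn.Linear(dim, dim, bias=False)
        self.to_k = nn.Linear(dim, dim, bias=False)
        self.to_v = nn.Linear(dim, dim, bias=False)

    def forward(self, x, context=None, mask=None):
        source = x if context is None else context            # self- or cross-attention
        q, k, v = self.to_q(x), self.to_k(source), self.to_v(source)
        scores = q @ k.transpose(-2, -1) / (self.dim ** 0.5)  # scaled dot-product scores
        if mask is not None:                                  # assumes a boolean mask, True = keep
            scores = scores.masked_fill(~mask, float("-inf"))
        return F.softmax(scores, dim=-1) @ v

attn = SimpleDotProductAttention(dim=512)
out = attn(torch.randn(1, 10, 512))                           # (1, 10, 512)
```
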
71 | ---
72 |
73 | ## Example of Using the FlashAttentionTwo Class
74 | --------------------------------------------
75 |
76 | ```python
77 | import torch
78 | from zeta import FlashAttentionTwo
79 | # Create an instance of the FlashAttentionTwo class
80 | attention = FlashAttentionTwo(dim=512, heads=8, dim_head=64)
81 |
82 | # Create some input data
83 | x = torch.randn(1, 10, 512)
84 |
85 | # Apply the attention mechanism
86 | out = attention(x)
87 | ```
88 |
89 |
90 | In this example, we first create an instance of the `FlashAttentionTwo` class. We then create some input data `x` and apply the attention mechanism to this data by calling the `forward` method of the `attention` instance.
--------------------------------------------------------------------------------
/docs/zeta/nn/attention/flash2.md:
--------------------------------------------------------------------------------
1 | # Module Name: FlashAttentionTwo
2 |
3 | The `FlashAttentionTwo` class is a PyTorch module that implements a variant of the attention mechanism, which is a key component in many state-of-the-art models in natural language processing and other fields. This class is designed to be memory-efficient and optionally supports parallel computation and mixed precision for improved performance.
4 |
5 | ## Class Definition
6 | ----------------
7 |
8 | ```python
9 | class FlashAttentionTwo(nn.Module):
10 | def __init__(
11 | self,
12 | *,
13 | dim,
14 | heads = 8,
15 | dim_head = 64,
16 | causal = False,
17 | q_bucket_size = 512,
18 | k_bucket_size = 1024,
19 | parallel = False,
20 | mixed_precision = False
21 | ):
22 | ```
23 |
24 | ---
25 |
26 | ### Parameters
27 |
28 | - `dim` (int): The dimensionality of the input data.
29 | - `heads` (int, optional): The number of attention heads. Default is 8.
30 | - `dim_head` (int, optional): The dimensionality of each attention head. Default is 64.
31 | - `causal` (bool, optional): If True, the attention mechanism is causal. Default is False.
32 | - `q_bucket_size` (int, optional): The bucket size for the query in the attention mechanism. Default is 512.
33 | - `k_bucket_size` (int, optional): The bucket size for the key in the attention mechanism. Default is 1024.
34 | - `parallel` (bool, optional): If True, the computation is performed in parallel across multiple GPUs. Default is False.
35 | - `mixed_precision` (bool, optional): If True, the computation is performed in mixed precision for improved performance. Default is False.
36 |
37 | -----
38 |
39 | ### Methods
40 |
41 | #### `forward`
42 |
43 | ```
44 | def forward(
45 | self,
46 | x,
47 | context = None,
48 | mask = None,
49 | q_bucket_size = None,
50 | k_bucket_size = None,
51 | ):
52 | ```
53 |
54 | Performs the forward pass of the attention mechanism.
55 |
56 | ##### Parameters
57 |
58 | - `x` (Tensor): The input data.
59 | - `context` (Tensor, optional): The context for the attention mechanism. If not provided, the input data `x` is used as the context.
60 | - `mask` (Tensor, optional): An optional mask for the attention mechanism.
61 | - `q_bucket_size` (int, optional): The bucket size for the query in the attention mechanism. If not provided, the value specified during initialization is used.
62 | - `k_bucket_size` (int, optional): The bucket size for the key in the attention mechanism. If not provided, the value specified during initialization is used.
63 |
64 | ---
65 |
66 | ##### Returns
67 |
68 | - `out` (Tensor): The output of the attention mechanism.
69 |
70 |
71 | ## Usage Examples
72 | --------------
73 |
74 | ### Example 1: Basic Usage
75 |
76 | ```python
77 | import torch
78 | from zeta import FlashAttentionTwo
79 |
80 | model = FlashAttentionTwo(dim=512)
81 | x = torch.randn(1, 10, 512)
82 | out = model(x)
83 | ```
84 |
85 |
86 |
87 | ### Example 2: Using a Mask
88 |
89 | ```python
90 | import torch
91 | from zeta import FlashAttentionTwo
92 |
93 | model = FlashAttentionTwo(dim=512)
94 | x = torch.randn(1, 10, 512)
95 | mask = torch.ones(1, 10)
96 | out = model(x, mask=mask)
97 | ```
98 |
99 | ----
100 |
101 | ### Example 3: Using a Context
102 |
103 | ```python
104 | import torch
105 | from zeta import FlashAttentionTwo
106 |
107 | model = FlashAttentionTwo(dim=512)
108 | x = torch.randn(1, 10, 512)
109 | context = torch.randn(1, 10, 512)
110 | out = model(x, context=context)
111 | ```
112 |
113 |
114 | ## Mathematical Formula
115 | --------------------
116 |
117 | The attention mechanism can be described by the following formula:
118 |
119 | \[ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) V \]
120 |
121 | where Q, K, and V are the query, key, and value, respectively. The softmax function ensures that the weights sum to 1, and the dot product of the weights and the value gives the output of the attention mechanism.
122 |
123 |
124 | ### Additional Information
125 | ----------------------
126 |
127 | The `FlashAttentionTwo` class is designed to be memory-efficient and optionally supports parallel computation and mixed precision for improved performance.
128 |
129 | - The `parallel` parameter allows the computation to be performed in parallel across multiple GPUs. This can significantly speed up the computation for large models or large datasets.
130 |
131 | - The `mixed_precision` parameter allows the computation to be performed in mixed precision. This means that some operations are performed in lower precision (e.g., float16) and some in higher precision (e.g., float32). This can significantly speed up the computation and reduce memory usage on modern GPUs that support mixed precision.
132 |
133 | - The `q_bucket_size` and `k_bucket_size` parameters control the bucket size for the query and key in the attention mechanism, respectively. These parameters can be used to trade off between memory usage and computational efficiency: smaller bucket sizes are more memory-efficient but may be slower, while larger bucket sizes are faster but use more memory.
134 |
135 | ### Common Issues
136 | -------------
137 |
138 | - If you encounter out-of-memory errors, you can try reducing the `q_bucket_size` and `k_bucket_size` parameters, or enabling mixed precision computation by setting `mixed_precision=True`.
139 |
140 | - If you encounter slow computation, you can try increasing the `q_bucket_size` and `k_bucket_size` parameters, or enabling parallel computation by setting `parallel=True` (if you have multiple GPUs available).
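
For example, a memory-constrained configuration might look like the sketch below. The parameter values are illustrative only; the bucket sizes can be set at construction time or overridden per call, as shown in the `forward` signature above.

```python
import torch
from zeta import FlashAttentionTwo

# smaller buckets + mixed precision to reduce memory pressure
model = FlashAttentionTwo(dim=512, q_bucket_size=256, k_bucket_size=512, mixed_precision=True)
x = torch.randn(1, 2048, 512)
out = model(x)

# bucket sizes can also be overridden for a single forward pass
out = model(x, q_bucket_size=128, k_bucket_size=256)
```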
141 |
142 | ### References and Resources
143 | ------------------------
144 |
145 | - [Attention Is All You Need](https://arxiv.org/abs/1706.03762): This is the original paper that introduced the concept of attention in deep learning.
146 |
147 | - [PyTorch Documentation](https://pytorch.org/docs/stable/index.html): The official PyTorch documentation provides detailed information about the PyTorch library and its modules.
148 |
149 | - [Efficient Attention: Attention with Linear Complexities](https://arxiv.org/abs/1812.01243): This paper introduces the concept of bucketing in the attention mechanism to improve memory efficiency.
150 |
151 | - [Mixed Precision Training](https://arxiv.org/abs/1710.03740): This paper introduces the concept of mixed precision training, which can significantly speed up computation and reduce memory usage on modern GPUs.
152 |
153 | - [PyTorch Tutorials](https://pytorch.org/tutorials/): The official PyTorch tutorials provide many examples of how to use PyTorch for various tasks.
154 |
155 |
--------------------------------------------------------------------------------
/docs/zeta/nn/attention/flash_attention.md:
--------------------------------------------------------------------------------
1 | # FlashAttention
2 |
3 | The FlashAttention module performs efficient attention computations, specifically designed for leveraging hardware capabilities on certain NVIDIA GPUs. It offers the option to perform "flash" attention which can be computationally faster on specific GPU architectures.
4 |
5 | ---
6 |
7 | ## Class Definition:
8 |
9 | ```python
10 | class FlashAttention(nn.Module):
11 | ```
12 |
13 | ### Parameters:
14 |
15 | - `causal` (bool, optional): Determines whether to apply causal masking. Default: False.
16 | - `dropout` (float, optional): Dropout probability. Default: 0.
17 | - `flash` (bool, optional): Whether to use flash attention. Requires PyTorch version 2.0 or above. Default: True.
18 |
19 | ---
20 |
21 | ## Methods:
22 |
23 | ### `__init__(self, causal=False, dropout=0., flash=True)`
24 |
25 | Initializes the FlashAttention module.
26 |
27 | ### `get_mask(self, i, j, device)`
28 |
29 | Generates a mask for attention computation.
30 |
31 | #### Parameters:
32 | - `i` (int): Length of the query sequence.
33 | - `j` (int): Length of the key sequence.
34 | - `device` (torch.device): Device to place the mask tensor.
35 |
36 | #### Returns:
37 | - `torch.Tensor`: Mask tensor of shape `(i, j)`.
38 |
39 | ### `flash_attn(self, q, k, v, mask=None, attn_bias=None)`
40 |
41 | Performs flash attention computation.
42 |
43 | #### Parameters:
44 | - `q` (torch.Tensor): Query tensor of shape `(batch, heads, q_len, dim)`.
45 | - `k` (torch.Tensor): Key tensor of shape `(batch, heads, k_len, dim)`.
46 | - `v` (torch.Tensor): Value tensor of shape `(batch, heads, v_len, dim)`.
47 | - `mask` (torch.Tensor, optional): Mask tensor of shape `(batch, heads, q_len, k_len)`. Default: None.
48 | - `attn_bias` (torch.Tensor, optional): Attention bias tensor of shape `(batch, heads, q_len, k_len)`. Default: None.
49 |
50 | #### Returns:
51 | - `torch.Tensor`: Output tensor of shape `(batch, heads, q_len, dim)`.
52 |
53 | ### `forward(self, q, k, v, mask=None, attn_bias=None)`
54 |
55 | Performs the attention computation using einstein notation.
56 |
57 | #### Parameters:
58 | - `q` (torch.Tensor): Query tensor of shape `(batch, heads, q_len, dim)`.
59 | - `k` (torch.Tensor): Key tensor of shape `(batch, heads, k_len, dim)`.
60 | - `v` (torch.Tensor): Value tensor of shape `(batch, heads, v_len, dim)`.
61 | - `mask` (torch.Tensor, optional): Mask tensor of shape `(batch, heads, q_len, k_len)`. Default: None.
62 | - `attn_bias` (torch.Tensor, optional): Attention bias tensor of shape `(batch, heads, q_len, k_len)`. Default: None.
63 |
64 | #### Returns:
65 | - `torch.Tensor`: Attention output tensor.
66 |
67 | ---
68 |
69 | ## Usage Examples:
70 |
71 | 1. **Basic Usage**:
72 | ```python
73 | from zeta.nn import FlashAttention
74 | attn_module = FlashAttention()
75 | output = attn_module(query_tensor, key_tensor, value_tensor)
76 | ```
77 |
78 | 2. **Using Flash Attention with Masking**:
79 | ```python
80 | from zeta.nn import FlashAttention
81 | attn_module = FlashAttention(flash=True)
82 | mask = attn_module.get_mask(query_length, key_length, device)
83 | output = attn_module(query_tensor, key_tensor, value_tensor, mask=mask)
84 | ```
85 |
86 | 3. **Using Causal Flash Attention with Dropout**:
87 | ```python
88 | from zeta.nn import FlashAttention
89 | attn_module = FlashAttention(causal=True, dropout=0.1, flash=True)
90 | output = attn_module(query_tensor, key_tensor, value_tensor)
91 | ```
92 |
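4. **A Fully Self-Contained Sketch**: the example below spells out concrete tensor shapes following the `(batch, heads, seq_len, dim)` convention documented above; the shapes and parameter values are illustrative only.
```python
import torch
from zeta.nn import FlashAttention

batch, heads, q_len, k_len, dim = 2, 8, 16, 32, 64
q = torch.randn(batch, heads, q_len, dim)
k = torch.randn(batch, heads, k_len, dim)
v = torch.randn(batch, heads, k_len, dim)

# flash=False falls back to the standard einstein-notation path on any hardware
attn_module = FlashAttention(causal=False, dropout=0.0, flash=False)
output = attn_module(q, k, v)
print(output.shape)  # torch.Size([2, 8, 16, 64])
```
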
93 | ---
94 |
95 | ## Additional Tips:
96 |
97 | - The `FlashAttention` module is optimized for NVIDIA A100 GPUs. On these GPUs, using `flash=True` is recommended for faster computation.
98 | - Ensure that PyTorch version is 2.0 or above when enabling flash attention.
99 | - The mask generated using `get_mask` method is useful for attention computations where certain positions need to be masked out.
100 |
101 | ---
102 |
103 | ## References:
104 |
105 | - Original Attention Mechanism: [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
--------------------------------------------------------------------------------
/docs/zeta/nn/attention/multihead.md:
--------------------------------------------------------------------------------
1 | # Multihead Attention Documentation for Zeta Library
2 |
3 | ## Introduction
4 |
5 | `MultiheadAttention` is a module in the Zeta library that provides a multi-head attention mechanism. This mechanism enables the model to focus on different parts of the input sequence simultaneously. It's widely used in models such as transformers for capturing various aspects of information in the input.
6 |
7 | ## Purpose
8 |
9 | The purpose of the `MultiheadAttention` module is to allow joint information representation from different subspaces of the input sequence. This results in capturing a richer context when modeling sequences.
10 |
11 | ## Architecture
12 |
13 | The `MultiheadAttention` class extends from the `nn.Module` base class. Internally, it uses linear transformations for keys, values, and queries (`k_proj`, `v_proj`, `q_proj`). These projections are wrapped using the `MultiwayWrapper`. It also utilizes layer normalization (`inner_attn_ln`) and optionally uses relative positional embeddings (`xpos`).
14 |
15 | ## Class Definition
16 |
17 | ```python
18 | class zeta.nn.embeddings.MultiheadAttention(nn.Module):
19 | ```
20 |
21 | ### Parameters:
22 | - `args`: General arguments passed for configuring the module.
23 | - `embed_dim` (int): Total dimension of the model.
24 | - `num_heads` (int): Number of parallel attention heads. The embed_dim will be split across num_heads.
25 | - `dropout` (float): Dropout probability. Default: 0.0.
26 | - `self_attention` (bool): Whether to apply self attention. Only one of `self_attention` or `encoder_decoder_attention` can be True. Default: False.
27 | - `encoder_decoder_attention` (bool): Whether to apply encoder-decoder attention. Only one of `self_attention` or `encoder_decoder_attention` can be True. Default: False.
28 | - `subln` (bool): If True, applies layer normalization after self attention. Default: False.
29 |
30 | ### Methods:
31 |
32 | #### `reset_parameters()`
33 | Reinitialize the parameters of the attention module.
34 |
35 | #### `forward(query, key, value, ...)`
36 | Computes the forward pass of the attention mechanism.
37 |
38 | - Parameters:
39 | - `query` (Tensor): The query tensor.
40 | - `key` (Tensor): The key tensor.
41 | - `value` (Tensor): The value tensor.
42 | - Other arguments including `incremental_state`, `key_padding_mask`, `attn_mask`, `rel_pos`, and `is_first_step`.
43 |
44 | - Returns:
45 | - `attn` (Tensor): The computed attention tensor.
46 | - `attn_weights` (Tensor): The attention weights.
47 |
48 | ### Mathematical Formulation:
49 |
50 | Given a query \( Q \), key \( K \), and value \( V \), the multihead attention mechanism is mathematically represented as:
51 |
52 | \[ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) V \]
53 |
54 | Where \( d_k \) is the dimension of the key.
55 |
56 | ## Usage Examples:
57 |
58 | ### Example 1: Basic Usage
59 |
60 | ```python
61 | from zeta.nn.embeddings import MultiheadAttention
62 | import torch
63 |
64 | args = ... # Some configuration
65 | attention = MultiheadAttention(args, embed_dim=512, num_heads=8, dropout=0.1, self_attention=True)
66 | query = torch.rand((32, 10, 512))
67 | key = torch.rand((32, 10, 512))
68 | value = torch.rand((32, 10, 512))
69 |
70 | attn, attn_weights = attention(query, key, value)
71 | ```
72 |
73 | ### Example 2: With Masking
74 |
75 | ```python
76 | from zeta.nn.embeddings import MultiheadAttention
77 | import torch
78 |
79 | args = ... # Some configuration
80 | attention = MultiheadAttention(args, embed_dim=512, num_heads=8, dropout=0.1, self_attention=True)
81 | query = torch.rand((32, 10, 512))
82 | key = torch.rand((32, 10, 512))
83 | value = torch.rand((32, 10, 512))
84 | attn_mask = torch.triu(torch.ones(10, 10), diagonal=1) * -1e9  # mask out future (strictly upper-triangular) positions
85 |
86 | attn, attn_weights = attention(query, key, value, attn_mask=attn_mask)
87 | ```
88 |
89 | ### Example 3: Encoder-Decoder Attention
90 |
91 | ```python
92 | from zeta.nn.embeddings import MultiheadAttention
93 | import torch
94 |
95 | args = ... # Some configuration
96 | attention = MultiheadAttention(args, embed_dim=512, num_heads=8, dropout=0.1, encoder_decoder_attention=True)
97 | query = torch.rand((32, 10, 512)) # Decoder query
98 | key = torch.rand((32, 20, 512)) # Encoder key
99 | value = torch.rand((32, 20, 512)) # Encoder value
100 |
101 | attn, attn_weights = attention(query, key, value)
102 | ```
103 |
104 | ## Additional Tips:
105 | - For encoder-decoder attention, make sure the dimensions of the encoder and decoder tensors match the expected input sizes.
106 | - Using masks can be helpful to prevent the attention mechanism from focusing on certain parts of the sequence, such as padding.
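
For instance, a padding mask can be built directly from the sequence lengths, as in the sketch below. This assumes `key_padding_mask` follows the common PyTorch convention of `True` marking padded positions; verify against the Zeta implementation before relying on it.

```python
import torch

batch, seq_len = 32, 10
lengths = torch.tensor([10, 7, 9] + [10] * 29)        # true length of each of the 32 sequences
positions = torch.arange(seq_len).unsqueeze(0)        # (1, seq_len)
key_padding_mask = positions >= lengths.unsqueeze(1)  # (batch, seq_len), True where padded

# attn, attn_weights = attention(query, key, value, key_padding_mask=key_padding_mask)
```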
107 |
--------------------------------------------------------------------------------
/docs/zeta/nn/attention/multiquery.md:
--------------------------------------------------------------------------------
1 | # MultiQueryAttention
2 |
3 | ## Overview and Introduction:
4 |
5 | The `MultiQueryAttention` class is a part of the Zeta library, designed to perform self-attention operations on given input data. In multi-query attention, the query is split across multiple heads while a single key and value head is shared among them, reducing memory bandwidth and cache size relative to full multi-head attention. This class allows for various implementations of attention, including Flash, Triton, and Torch. It also provides the flexibility to choose normalization type, fully connected layer type, and offers debugging verbosity.
6 |
7 | ## Class Definition:
8 |
9 | ```python
10 | class MultiQueryAttention(nn.Module):
11 | """Multi-Query self attention.
12 | Using torch or triton attention implementation enables the user to also use
13 | additive bias.
14 | """
15 | ```
16 |
17 | ### Parameters:
18 | - `d_model` (int): Dimension of the model.
19 | - `heads` (int): Number of parallel attention heads.
20 | - `attn_impl` (str, optional): Attention implementation type, can be either 'triton', 'flash', or 'torch'. Default is 'triton'.
21 | - `clip_qkv` (Optional[float]): Clipping value for query, key, and value. If specified, qkv is clamped within the range [-clip_qkv, clip_qkv].
22 | - `qk_ln` (bool, optional): If True, layer normalization is applied to query and key.
23 | - `softmax_scale` (Optional[float]): Scale for softmax. Default value is computed as 1/sqrt(head_dim).
24 | - `attn_pdrop` (float, optional): Attention dropout probability. Default is 0.0.
25 | - `norm_type` (str, optional): Normalization type, default is 'low_precision_layernorm'.
26 | - `fc_type` (str, optional): Fully connected layer type, default is 'torch'.
27 | - `verbose` (int, optional): Verbosity level, default is 0.
28 | - `device` (Optional[str]): Device to which the tensors should be moved.
29 |
30 | ## Functionality and Usage:
31 |
32 | The `MultiQueryAttention` class operates by using multiple queries to capture broader context information from given data. This is achieved through the forward method which computes the self-attention on the given inputs.
33 |
34 | ### Method: `forward`
35 | ```python
36 | def forward(
37 | self,
38 | x,
39 | past_key_value=None,
40 | bias=None,
41 | mask=None,
42 | causal=True,
43 | needs_weights=False,
44 | ):
45 | ```
46 |
47 | #### Parameters:
48 |
49 | - `x` (Tensor): Input tensor.
50 | - `past_key_value` (Optional): Past key and value for attention computation. Default is None.
51 | - `bias` (Optional): Additive bias for attention scores. Default is None.
52 | - `mask` (Optional): Key padding mask. Default is None.
53 | - `causal` (bool, optional): If True, a causal mask is applied to prevent information flow from future tokens. Default is True.
54 | - `needs_weights` (bool, optional): If True, attention weights are also returned. Default is False.
55 |
56 | #### Returns:
57 |
58 | - `context` (Tensor): Contextualized tensor after attention computation.
59 | - `attn_weights` (Tensor, Optional): Attention weights. Only returned if `needs_weights` is True.
60 | - `past_key_value` (Tensor, Optional): New past key and value.
61 |
62 | ## Usage Examples:
63 |
64 | 1. Basic Usage:
65 | ```python
66 | from zeta import MultiQueryAttention
67 | import torch
68 |
69 | # Initialize the attention module
70 | attention_layer = MultiQueryAttention(d_model=512, heads=8, attn_impl='torch')
71 |
72 | # Random input tensor
73 | x = torch.rand(16, 10, 512) # Batch of 16, sequence length 10, embedding size 512
74 | output, attn_weights, _ = attention_layer(x)
75 | ```
76 |
77 | 2. Using Past Key and Value:
78 | ```python
79 | past_key_value = (torch.rand(16, 8, 10, 64), torch.rand(16, 8, 10, 64)) # Past key and value for 8 heads
80 | output, attn_weights, new_past_key_value = attention_layer(x, past_key_value=past_key_value)
81 | ```
82 |
83 | 3. With Causal Masking and Weights:
84 | ```python
85 | output, attn_weights, _ = attention_layer(x, causal=True, needs_weights=True)
86 | ```
87 |
88 | ## Mathematical Formula:
89 |
90 | For the self-attention mechanism, the computation involves using multiple queries (\( Q \)), keys (\( K \)), and values (\( V \)):
91 |
92 | ```latex
93 | \[ \text{Attention}(Q, K, V) = \text{Softmax}\left(\frac{Q \times K^T}{\sqrt{d_k}} + \text{Bias}\right) \times V \]
94 | ```
95 | Where:
96 | - \( Q \), \( K \), and \( V \) are the queries, keys, and values respectively.
97 | - \( d_k \) is the dimension of the keys.
98 | - Bias is the optional additive bias.
99 |
100 | ## Additional Information and Tips:
101 |
102 | - It's crucial to select the correct attention implementation (`attn_impl`) based on your needs and the hardware you're running on.
103 | - The `triton` implementation might be faster than `flash` but can use more memory. Ensure that you have adequate GPU memory if using `triton`.
104 | - If using the `torch` implementation, it's advisable to check if CUDA is available for GPU acceleration.
105 | - The clipping of qkv (`clip_qkv`) can be beneficial for stability in training.
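
For example, the device check suggested above might look like the following sketch; the `device` argument is the one listed in the parameters, and the values are illustrative only.

```python
import torch
from zeta import MultiQueryAttention

device = "cuda" if torch.cuda.is_available() else "cpu"
attention_layer = MultiQueryAttention(d_model=512, heads=8, attn_impl="torch", device=device)

x = torch.rand(16, 10, 512).to(device)  # batch 16, sequence length 10, embedding size 512
output, attn_weights, _ = attention_layer(x)
```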
106 |
107 | ## References and Resources:
108 | For a deeper understanding of the self-attention mechanism and its variants, you can refer to the "Attention is All You Need" paper by Vaswani et al., 2017.
--------------------------------------------------------------------------------
/docs/zeta/nn/biases/alibi.md:
--------------------------------------------------------------------------------
1 | # AlibiPositionalBias Documentation
2 |
3 | ## Introduction
4 |
5 | The `AlibiPositionalBias` module belongs to the Zeta library and plays a crucial role in handling positional bias for multi-head attention mechanisms. Rather than relying on absolute positional embeddings, it adds a linear bias to attention scores that grows with the distance between positions, with a per-head slope derived from the number of attention heads.
6 |
7 | ## Class Definition:
8 |
9 | ```python
10 | class AlibiPositionalBias(nn.Module):
11 | ```
12 |
13 | ### Parameters:
14 | - **heads** (`int`): Number of attention heads for which the slopes need to be calculated.
15 | - **total_heads** (`int`): Total number of attention heads in the network.
16 |
17 | ### Attributes:
18 | - **slopes** (`Tensor`): Tensor containing slope values, which are computed based on the number of heads.
19 | - **bias** (`Tensor` or `None`): Tensor caching the computed positional bias values. It is `None` until the bias is first computed, or when it needs to be recomputed.
20 |
21 | ### Methods:
22 | #### `__init__(self, heads, total_heads, **kwargs) -> None`:
23 | Initializes the `AlibiPositionalBias` module.
24 |
25 | #### `get_bias(self, i, j, device) -> Tensor`:
26 | Computes the positional bias for given dimensions i and j.
27 |
28 | - **Parameters**:
29 | - **i** (`int`): One dimension of the required positional bias.
30 | - **j** (`int`): Second dimension of the required positional bias.
31 | - **device** (`torch.device`): The device on which computations are to be performed.
32 |
33 | #### `_get_slopes(heads) -> List[float]`:
34 | A static method that calculates slopes based on the number of attention heads.
35 |
36 | - **Parameters**:
37 | - **heads** (`int`): Number of attention heads.
38 |
39 | #### `forward(self, i, j) -> Tensor`:
40 | Computes or retrieves the bias tensor for given dimensions.
41 |
42 | - **Parameters**:
43 | - **i** (`int`): One dimension for the required positional bias.
44 | - **j** (`int`): Second dimension for the required positional bias.
45 |
46 | ## Mathematical Formula:
47 |
48 | Given `n` attention heads, the alibi positional bias can be represented as:
49 |
50 | \[ \text{Bias} = -\left| j_{\text{range}} \right| \times \text{slope} \]
51 |
52 | Where:
53 | - \( j_{\text{range}} \) is an array of numbers from `0` to `j-1`.
54 | - `slope` is computed based on the number of heads using `_get_slopes` method.
55 |
56 | ## Usage Examples:
57 |
58 | ### Example 1: Initialize and compute bias
59 | ```python
60 | from zeta import AlibiPositionalBias
61 | import torch
62 |
63 | bias_module = AlibiPositionalBias(heads=4, total_heads=8)
64 | bias = bias_module(10, 10)
65 | print(bias)
66 | ```
67 |
68 | ### Example 2: Retrieve stored bias
69 | ```python
70 | bias = bias_module(5, 5)
71 | print(bias)
72 | ```
73 |
74 | ### Example 3: Computing bias for different dimensions
75 | ```python
76 | bias = bias_module(8, 15)
77 | print(bias)
78 | ```
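
### Example 4: Adding the bias to attention scores (illustrative sketch)

The snippet below shows how the returned bias can be added to raw attention logits before the softmax. The output shape of `forward` is assumed here to broadcast as `(heads, i, j)`; check this against the installed version of zeta.

```python
from zeta import AlibiPositionalBias
import torch

bias_module = AlibiPositionalBias(heads=4, total_heads=4)

# Attention logits: (batch, heads, query_len, key_len)
logits = torch.randn(2, 4, 10, 10)

alibi_bias = bias_module(10, 10)               # assumed shape: (heads, 10, 10)
attn = (logits + alibi_bias).softmax(dim=-1)   # bias broadcasts over the batch
print(attn.shape)  # torch.Size([2, 4, 10, 10])
```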
79 |
80 | ## Note:
81 |
82 | - It's crucial to ensure that the `total_heads` parameter is always greater than or equal to the `heads` parameter during initialization.
83 | - The device property is internally used to determine the computation device based on the registered buffers.
84 |
85 | ## References:
86 |
87 | For a deeper understanding and applications of positional bias in attention mechanisms, one may refer to the foundational paper on Transformer architectures:
88 | - [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
89 |
90 | Also, the `einops` library provides a versatile interface for tensor manipulations. More details can be found at its official [documentation](https://einops.rocks/).
--------------------------------------------------------------------------------
/docs/zeta/nn/biases/relative_bias.md:
--------------------------------------------------------------------------------
1 | # RelativePositionBias
2 |
3 | `RelativePositionBias` is a specialized PyTorch module designed to generate relative position biases, which can be vital for certain attention mechanisms in deep learning architectures. This module quantizes the distance between two positions into a certain number of buckets and then uses an embedding to get the relative position bias. This mechanism aids in the attention mechanism by providing biases based on relative positions between the query and key, rather than relying solely on their absolute positions.
4 |
5 | ## Architecture:
6 | The architecture can be visualized in three major steps:
7 | 1. **Bucketing:** Convert relative distances between two positions into bucket indices.
8 | 2. **Embedding:** Use the bucket indices to get embeddings for each pair of positions.
9 | 3. **Computing Bias:** Computes the bias values based on the embeddings.
10 |
11 | ## Purpose:
12 | In the context of attention mechanisms, especially the transformer-based architectures, the position of tokens can provide valuable information. The `RelativePositionBias` class helps introduce this information in a compact form by bucketing relative positions and then embedding them to serve as biases for the attention scores.
13 |
14 | ## Mathematical Formula:
15 | Given a relative position \( r \), the bucket index \( b \) is computed as:
16 | \[ b =
17 | \begin{cases}
18 | n + \frac{\text{num\_buckets}}{2} & \text{if } n < 0 \text{ and bidirectional is True} \\
19 | \min\left( \text{max\_exact} + \frac{\log\left(n / \text{max\_exact}\right)}{\log\left(\text{max\_distance} / \text{max\_exact}\right)} \times \left(\text{num\_buckets} - \text{max\_exact}\right),\; \text{num\_buckets} - 1 \right) & \text{otherwise}
20 | \end{cases}
21 | \]
22 | Where \( n \) is the negative of the relative position, and \( \text{max\_exact} = \text{num\_buckets} / 2 \).
23 |
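A simplified, scalar sketch of this bucketing rule is shown below. It is illustrative only: the library's `_relative_position_bucket` static method operates on tensors, and its exact signature may differ.

```python
import math

def relative_position_bucket(relative_position: int, bidirectional: bool = True,
                             num_buckets: int = 32, max_distance: int = 128) -> int:
    # n is the negative of the relative position, as in the formula above
    n = -relative_position
    offset = 0
    if bidirectional:
        num_buckets //= 2
        if n < 0:
            offset = num_buckets
        n = abs(n)
    else:
        n = max(n, 0)

    max_exact = num_buckets // 2
    if n < max_exact:
        # small distances each get their own bucket
        return offset + n
    # larger distances are bucketed logarithmically up to max_distance
    log_bucket = max_exact + int(
        math.log(n / max_exact) / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    )
    return offset + min(log_bucket, num_buckets - 1)
```
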
24 | ## Class Definition:
25 |
26 | ```python
27 | class RelativePositionBias(nn.Module):
28 | """
29 | Compute relative position bias which can be utilized in attention mechanisms.
30 |
31 | Parameters:
32 | - bidirectional (bool): If True, considers both forward and backward relative positions. Default: True.
33 | - num_buckets (int): Number of buckets to cluster relative position distances. Default: 32.
34 | - max_distance (int): Maximum distance to be considered for bucketing. Distances beyond this will be mapped to the last bucket. Default: 128.
35 | - n_heads (int): Number of attention heads. Default: 12.
36 | """
37 | ```
38 |
39 | ### Key Methods:
40 | - **_relative_position_bucket**: This static method is responsible for converting relative positions into bucket indices.
41 | - **compute_bias**: Computes the relative position bias for given lengths of queries and keys.
42 | - **forward**: Computes and returns the relative position biases for a batch.
43 |
44 | ## Usage Examples:
45 |
46 | ```python
47 | from zeta import RelativePositionBias
48 | import torch
49 | from torch import nn
50 | # Initialize the RelativePositionBias module
51 | rel_pos_bias = RelativePositionBias()
52 |
53 | # Example 1: Compute bias for a single batch
54 | bias_matrix = rel_pos_bias(1, 10, 10)
55 |
56 | # Example 2: Utilize in conjunction with an attention mechanism
57 | # NOTE: This is a mock example, and may not represent an actual attention mechanism's complete implementation.
58 | class MockAttention(nn.Module):
59 | def __init__(self):
60 | super().__init__()
61 | self.rel_pos_bias = RelativePositionBias()
62 |
63 | def forward(self, queries, keys):
64 | bias = self.rel_pos_bias(queries.size(0), queries.size(1), keys.size(1))
65 | # Further computations with bias in the attention mechanism...
66 | return None # Placeholder
67 |
68 | # Example 3: Modify default configurations
69 | custom_rel_pos_bias = RelativePositionBias(bidirectional=False, num_buckets=64, max_distance=256, n_heads=8)
70 | ```
71 |
72 | ## Tips:
73 | 1. The choice of `num_buckets` and `max_distance` might need tuning based on the dataset and application.
74 | 2. If the architecture doesn't need bidirectional biases, set `bidirectional` to `False` to reduce computation.
75 | 3. Ensure that the device of tensors being processed and the device of the `RelativePositionBias` module are the same.
76 |
77 | ## References:
78 | - [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
79 | - [Transformer Architectures](https://www.aclweb.org/anthology/D18-1422.pdf)
80 |
81 | Note: This documentation is based on the provided code and might need adjustments when integrated into the complete `zeta` library.
--------------------------------------------------------------------------------
/docs/zeta/nn/biases/xpos.md:
--------------------------------------------------------------------------------
1 | # XPOS Module Documentation
2 | -------------------------
3 |
4 | ### Architecture
5 |
6 | The XPOS module is a part of a neural network model and is implemented as a subclass of `torch.nn.Module`. It consists of several functions and a class that work together to apply rotary positional embeddings to an input tensor.
7 |
8 | ### Purpose
9 |
10 | The purpose of the XPOS module is to incorporate positional information into the input tensor of a neural network model. It achieves this by generating fixed positional embeddings and applying them to the input tensor using rotary positional encoding techniques. This allows the model to capture the sequential order and relative positions of the input elements, which can be beneficial for tasks such as natural language processing and time series analysis.
11 |
12 | ### Functions and Methods
13 |
14 | 1. `fixed_pos_embedding(x)`: Generates fixed positional embeddings for the input tensor.
15 |
16 | - Args:
17 | - `x` (torch.Tensor): Input tensor of shape `(seq_len, dim)`.
18 | - Returns:
19 | - `sin` (torch.Tensor): Sine positional embeddings of shape `(seq_len, dim)`.
20 | - `cos` (torch.Tensor): Cosine positional embeddings of shape `(seq_len, dim)`.
21 | 2. `rotate_every_two(x)`: Rearranges the elements of the input tensor by rotating every two elements.
22 |
23 | - Args:
24 | - `x` (torch.Tensor): Input tensor of shape `(batch_size, seq_len, dim)`.
25 | - Returns:
26 | - `x` (torch.Tensor): Rearranged tensor of shape `(batch_size, seq_len, dim)`.
27 | 3. `duplicate_interleave(m)`: Duplicates a matrix while interleaving the copy.
28 |
29 | - Args:
30 | - `m` (torch.Tensor): Input matrix.
31 | - Returns:
32 | - `m` (torch.Tensor): Duplicated and interleaved matrix.
33 | 4. `apply_rotary_pos_emb(x, sin, cos, scale=1)`: Applies rotary positional embeddings to the input tensor.
34 |
35 | - Args:
36 | - `x` (torch.Tensor): Input tensor of shape `(batch_size, seq_len, dim)`.
37 | - `sin` (torch.Tensor): Sine positional embeddings of shape `(seq_len, dim)`.
38 | - `cos` (torch.Tensor): Cosine positional embeddings of shape `(seq_len, dim)`.
39 | - `scale` (float): Scaling factor for the positional embeddings (default: 1).
40 | - Returns:
41 | - `x` (torch.Tensor): Tensor with applied rotary positional embeddings.
42 | 5. `XPOS(head_dim, scale_base=512)`: XPOS module class.
43 |
44 | - Args:
45 | - `head_dim` (int): Dimensionality of the input tensor.
46 | - `scale_base` (int): Base value for scaling the positional embeddings (default: 512).
47 | - Methods:
48 | - `forward(x, offset=0, downscale=False)`: Forward pass of the XPOS module.
49 | - Args:
50 | - `x` (torch.Tensor): Input tensor of shape `(batch_size, seq_len, dim)`.
51 | - `offset` (int): Offset value for positional embeddings (default: 0).
52 | - `downscale` (bool): Boolean indicating whether to downscale the positional embeddings (default: False).
53 | - Returns:
54 | - `x` (torch.Tensor): Tensor with applied rotary positional embeddings.
55 |
56 | ### Usage Examples
57 |
58 | 1. Applying XPOS module to an input tensor:
59 |
60 | ```python
61 | import torch
62 | from xpos import XPOS
63 |
64 | # Create an instance of the XPOS module
65 | xpos = XPOS(head_dim=256)
66 |
67 | # Generate a random input tensor
68 | x = torch.randn(1, 10, 256)
69 |
70 | # Apply the XPOS module to the input tensor
71 | output = xpos(x)
72 | ```
73 |
74 |
75 | 2. Applying XPOS module with offset and downscaling:
76 |
77 | ```python
78 | import torch
79 | from zeta import XPOS
80 |
81 | # Create an instance of the XPOS module
82 | xpos = XPOS(head_dim=512)
83 |
84 | # Generate a random input tensor
85 | x = torch.randn(1, 20, 512)
86 |
87 | # Apply the XPOS module to the input tensor with offset and downscaling
88 | output = xpos(x, offset=2, downscale=True)
89 | ```
90 | 3. Using the individual functions of the XPOS module:
91 |
92 | ```python
93 | import torch
94 | from zeta import fixed_pos_embedding, apply_rotary_pos_emb
95 |
96 | # Generate fixed positional embeddings
97 | scale = torch.randn(10, 256)
98 | sin, cos = fixed_pos_embedding(scale)
99 |
100 | # Apply rotary positional embeddings to an input tensor
101 | x = torch.randn(1, 10, 256)
102 | output = apply_rotary_pos_emb(x, sin, cos, scale=0.5)
103 | ```
104 |
105 | Note: The above examples assume that the `xpos.py` file
--------------------------------------------------------------------------------
/docs/zeta/nn/embeddings/multiway.md:
--------------------------------------------------------------------------------
1 | # **Documentation for `MultiwayEmbedding` in Zeta Library**
2 |
3 | **Table of Contents**
4 |
5 | 1. Overview
6 | 2. Class Definition and Parameters
7 | 3. Methods and Functionalities
8 | 4. Usage Examples
9 | 5. Additional Tips and Information
10 | 6. References
11 |
12 | ---
13 |
14 | ## 1. Overview
15 |
16 | The `MultiwayEmbedding` class in the Zeta library provides a way to apply two separate embeddings to two distinct parts of the input tensor. It splits the input tensor at the specified position and applies one embedding to the first part and another embedding to the second part. This can be particularly useful when dealing with inputs that require diverse representations or embeddings.
17 |
18 | ---
19 |
20 | ## 2. Class Definition and Parameters
21 |
22 | ```python
23 | class MultiwayEmbedding(MultiwayNetwork):
24 | """
25 | A specialized version of the MultiwayNetwork to perform multi-way embeddings on an input tensor.
26 |
27 | Parameters:
28 | - modules (List[nn.Module]): A list containing exactly two PyTorch modules. Typically these would be embedding layers.
29 | - dim (int): The dimension along which to split and concatenate the input tensor. Default is 1.
30 | """
31 |
32 | def __init__(self, modules, dim=1):
33 | super(MultiwayNetwork, self).__init__()
34 | ...
35 | ```
36 |
37 | ---
38 |
39 | ## 3. Methods and Functionalities
40 |
41 | **`forward(x, **kwargs)`**
42 | ```python
43 | def forward(self, x, **kwargs):
44 | """
45 | Forward method to apply embeddings on the split input tensor.
46 |
47 | Parameters:
48 | - x (torch.Tensor): The input tensor.
49 | - **kwargs: Additional arguments that might be needed for the embeddings.
50 |
51 | Returns:
52 | - torch.Tensor: Concatenated tensor after applying the embeddings.
53 | """
54 | ...
55 | ```
56 |
57 | ---
58 |
59 | ## 4. Usage Examples
60 |
61 | **Example 1:** Basic Usage
62 | ```python
63 | from zeta import MultiwayEmbedding
64 | import torch.nn as nn
65 | import torch
66 | emb1 = nn.Embedding(10, 5)
67 | emb2 = nn.Embedding(10, 5)
68 | multiway_emb = MultiwayEmbedding([emb1, emb2])
69 |
70 | x = torch.LongTensor([[1,2,3],[4,5,6]])
71 | output = multiway_emb(x)
72 | print(output)
73 | ```
74 |
75 | **Example 2:** Setting a Split Position
76 | ```python
77 | from zeta import MultiwayEmbedding, set_split_position
78 | import torch.nn as nn
79 | import torch
80 | emb1 = nn.Embedding(10, 5)
81 | emb2 = nn.Embedding(10, 5)
82 | multiway_emb = MultiwayEmbedding([emb1, emb2])
83 | multiway_emb.apply(set_split_position(2))
84 |
85 | x = torch.LongTensor([[1,2,3],[4,5,6]])
86 | output = multiway_emb(x)
87 | print(output)
88 | ```
89 |
90 | **Example 3:** Working with Different Embedding Dimensions
91 | ```python
92 | from zeta import MultiwayEmbedding
93 | import torch.nn as nn
94 | import torch
95 | emb1 = nn.Embedding(10, 5)
96 | emb2 = nn.Embedding(10, 7)
97 | multiway_emb = MultiwayEmbedding([emb1, emb2], dim=2)
98 |
99 | x = torch.LongTensor([[1,2,3],[4,5,6]])
100 | output = multiway_emb(x)
101 | print(output)
102 | ```
103 |
104 | ---
105 |
106 | ## 5. Additional Tips and Information
107 |
108 | - Ensure that the input tensor's dimensions align with the expected embeddings. If there's a mismatch in dimensions, a runtime error will occur.
109 | - The split position determines the point at which the tensor is divided. It's crucial to set this appropriately, especially if the embeddings have different dimensions.
110 | - Using the provided `set_split_position` utility function makes it easy to apply the split position for the embeddings.
111 |
112 | ---
113 |
114 | ## 6. References
115 |
116 | - Torch documentation: [Link to PyTorch Documentation](https://pytorch.org/docs/stable/index.html)
117 | - Agora: [Link to Agora's GitHub](#)
118 |
119 | ---
120 |
121 | **Note:** Ensure that the tensor operations align mathematically, especially if you're concatenating tensors with different dimensions. In such cases, ensure the embeddings produce tensors that can be concatenated along the specified dimension.
122 |
123 | **Mathematical Explanation:** Given an input tensor \( X \) split into \( X_1 \) and \( X_2 \), and two embeddings \( A \) and \( B \), the output is given by concatenating \( A(X_1) \) and \( B(X_2) \).
--------------------------------------------------------------------------------
/docs/zeta/nn/embeddings/rope.md:
--------------------------------------------------------------------------------
1 | # RotaryEmbedding
2 |
3 | `RotaryEmbedding` is a PyTorch module implementing the rotary embedding mechanism. It is designed to handle sequences of any length without the need for fine-tuning, and can also incorporate positional information into the embeddings.
4 |
5 | ## Class Definition
6 |
7 | ```python
8 | class RotaryEmbedding(nn.Module):
9 | def __init__(
10 | self,
11 | dim,
12 | use_xpos=False,
13 | scale_base=512,
14 | interpolation_factor=1.,
15 | base=10000,
16 | base_rescale_factor=1.,
17 | ):
18 | ...
19 | ```
20 |
21 | ### Parameters
22 |
23 | - `dim` (int): The dimensionality of the embeddings.
24 | - `use_xpos` (bool, optional): Whether to use positional information in the embeddings. Default: `False`.
25 | - `scale_base` (int, optional): Base of the scale for positional information. Default: `512`.
26 | - `interpolation_factor` (float, optional): Factor used for interpolating the embeddings. Default: `1.0`.
27 | - `base` (int, optional): Base of the frequencies used in the embeddings. Default: `10000`.
28 | - `base_rescale_factor` (float, optional): Factor used for rescaling the base of the frequencies. Default: `1.0`.
29 |
30 | ### Method: `forward`
31 |
32 | ```python
33 | def forward(self, seq_len, device):
34 | ...
35 | ```
36 |
37 | #### Parameters
38 |
39 | - `seq_len` (int): The length of the sequence.
40 | - `device` (torch.device): The device on which the computation will be performed.
41 |
42 | #### Returns
43 |
44 | - `freqs` (Tensor): The computed frequencies for the embeddings.
45 | - `scale` (Tensor): The computed scale for the embeddings.
46 |
47 | ## Functionality and Usage
48 |
49 | The `RotaryEmbedding` module computes rotary embeddings for a sequence of a given length. The embeddings are computed based on the frequency and scale of each position in the sequence. The frequency and scale are computed using the `inv_freq` and `scale` buffers registered in the module.
50 |
51 | The `forward` method computes the `freqs` and `scale` tensors based on the `seq_len` and `device` provided. The `freqs` tensor is computed by multiplying the `t` tensor, which contains the indices of the sequence, with the `inv_freq` tensor. The `scale` tensor is computed using the `scale` buffer and the `scale_base` parameter.
52 |
53 | Each of the `freqs` and `scale` tensors is then concatenated with itself along the last dimension, so that it spans the full embedding dimension, and both tensors are returned.
54 |
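The returned `freqs` (and, when `use_xpos=True`, `scale`) are typically consumed by a rotate-half style application on the queries and keys. A minimal sketch of that step, assuming the common convention rather than the exact zeta API, is:

```python
import torch

def rotate_half(x):
    # split the last dimension in half and rotate: (x1, x2) -> (-x2, x1)
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary(t, freqs, scale=1.0):
    # t: (batch, seq_len, dim); freqs: (seq_len, dim)
    return (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
```
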
55 | ### Usage Examples
56 |
57 | #### Example 1: Basic Usage
58 |
59 | ```python
60 | from zeta.nn import RotaryEmbedding
61 | import torch
62 | from torch import nn
63 |
64 | # Initialize the RotaryEmbedding module
65 | rotary_embedding = RotaryEmbedding(dim=64, use_xpos=True)
66 |
67 | # Compute the embeddings for a sequence of length 10
68 | seq_len = 10
69 | device = torch.device('cuda')
70 | freqs, scale = rotary_embedding(seq_len, device)
71 |
72 | print(freqs)
73 | print(scale)
74 | ```
75 |
76 | #### Example 2: Using a Different Scale Base
77 |
78 | ```python
79 | from zeta.nn import RotaryEmbedding
80 | import torch
81 | from torch import nn
82 |
83 | # Initialize the RotaryEmbedding module with a different scale base
84 | rotary_embedding = RotaryEmbedding(dim=64, use_xpos=True, scale_base=1024)
85 |
86 | # Compute the embeddings for a sequence of length 10
87 | seq_len = 10
88 | device = torch.device('cuda')
89 | freqs, scale = rotary_embedding(seq_len, device)
90 |
91 | print(freqs)
92 | print(scale)
93 | ```
94 |
95 | #### Example 3: Without Positional Information
96 |
97 | ```python
98 | from zeta.nn import RotaryEmbedding
99 | import torch
100 | from torch import nn
101 |
102 | # Initialize the RotaryEmbedding module without positional information
103 | rotary_embedding = RotaryEmbedding(dim=64, use_xpos=False)
104 |
105 | # Compute the embeddings for a sequence of length 10
106 | seq_len = 10
107 | device = torch.device('cuda')
108 | freqs, scale = rotary_embedding(seq_len, device)
109 |
110 | print(freqs)
111 | print(scale)
112 | ```
113 |
114 | ## Mathematical Formula
115 |
116 | The mathematical formula for computing the `freqs` tensor is:
117 |
118 | \[ \text{freqs} = t \cdot \text{inv\_freq} \]
119 |
120 | Where:
121 | - \( t \) is a tensor containing the indices of the sequence.
122 | - \( \text{inv\_freq} \) is a tensor containing the inverse frequencies.
123 |
124 | The mathematical formula for computing the `scale` tensor is:
125 |
126 | \[ \text{scale} = \text{scale}^{\frac{\text{power}}{\text{scale\_base}}} \]
127 |
128 | Where:
129 | - \( \text{power} \) is a tensor containing the power of each position in the sequence.
130 | - \( \text{scale\_base} \) is a scalar containing the base of the scale.
131 | - \( \text{scale} \) is a tensor containing the scale of each position in the sequence.
132 |
133 | ## Additional Information and Tips
134 |
135 | - The `interpolation_factor` parameter can be used to interpolate the embeddings for sequences of different lengths. A larger `interpolation_factor` will result in a smoother interpolation.
136 | - The `base_rescale_factor` parameter can be used to rescale the base of the frequencies. This can be useful for adjusting the embeddings for sequences of different lengths.
137 | - If `use_xpos` is set to `False`, the `scale` tensor will not be used, and the `freqs` tensor will be returned as is.
138 |
139 | ## References and Resources
140 |
141 | - [Paper: Link to the paper](https://arxiv.org/pdf/2308.10882.pdf)
142 | - [PyTorch Documentation](https://pytorch.org/docs/stable/index.html)
143 | - [Einops Documentation](https://einops.rocks/pytorch-examples.html)
144 |
--------------------------------------------------------------------------------
/docs/zeta/nn/embeddings/truncated_rope.md:
--------------------------------------------------------------------------------
1 | # Module/Function Name: TruncatedRotaryEmbedding
2 |
3 | The `TruncatedRotaryEmbedding` class is part of the Zeta library and is designed to implement the rotary embeddings with a truncation mechanism. The rotary embedding is a positional encoding method that aims to provide the model with information about the relative positions of the tokens in a sequence. The `TruncatedRotaryEmbedding` class extends the rotary embedding concept by incorporating a truncation mechanism, which sets the rotary embedding to zero for positions where the frequency is higher than a specified threshold.
4 |
5 | The architecture and workings of this class are inspired by the paper [link to the paper](https://arxiv.org/pdf/2308.10882.pdf).
6 |
7 | ## Parameters:
8 |
9 | - `dim` (int): Dimensionality of the embeddings.
10 | - `a` (float): Lower bound of the truncation region. Rotary embeddings with frequency lower than `a` will be set to zero.
11 | - `b` (float): Upper bound of the truncation region. Rotary embeddings with frequency higher than or equal to `b` will not be truncated.
12 | - `rho` (float): Value to which the rotary embeddings will be truncated in the region [a, b).
13 |
14 | The `dim` parameter is required to determine the dimensionality of the embeddings, while `a`, `b`, and `rho` are hyperparameters that control the truncation mechanism.
15 |
16 | ## Method:
17 |
18 | ### `forward(seq_len, device)`
19 |
20 | Computes the truncated rotary embeddings for a given sequence length.
21 |
22 | #### Parameters:
23 |
24 | - `seq_len` (int): Length of the sequence for which the rotary embeddings are to be computed.
25 | - `device` (torch.device): Device on which the computations are to be performed.
26 |
27 | #### Returns:
28 |
29 | - `result` (Tensor): A tensor containing the truncated rotary embeddings for the specified sequence length.
30 |
31 | ## Functionality and Usage:
32 |
33 | The `TruncatedRotaryEmbedding` class is used to compute the truncated rotary embeddings for a given sequence length. The rotary embeddings are computed by multiplying a tensor containing the position indices of the tokens in the sequence by the inverse frequencies. The inverse frequencies are computed based on the specified embedding dimension `dim` and are stored in the `inv_freq` buffer.
34 |
35 | The truncation mechanism is implemented by creating a `theta_star` tensor, which is used to multiply the computed `freqs`. The `theta_star` tensor is created based on the specified `a`, `b`, and `rho` parameters and the computed `freqs` tensor. For positions where the frequency is higher than or equal to `b`, the rotary embeddings are not truncated, and `theta_star` is set to the frequency at that position. For positions where the frequency is lower than `a`, the rotary embeddings are set to zero, and `theta_star` is set to zero. For positions where the frequency falls in the range [a, b), the rotary embeddings are truncated to `rho`, and `theta_star` is set to `rho`.
36 |
37 | Once the `theta_star` tensor is created, it is multiplied element-wise by the `freqs` tensor to compute the final truncated rotary embeddings.
38 |
39 | ### Usage Example:
40 |
41 | ```python
42 | from zeta.nn.embeddings.truncated_rope import TruncatedRotaryEmbedding
43 | import torch
44 |
45 | # Define the parameters
46 | dim = 64
47 | a = 0.1
48 | b = 0.9
49 | rho = 0.5
50 | seq_len = 100
51 | device = torch.device('cuda')
52 |
53 | # Create the TruncatedRotaryEmbedding module
54 | trunc_rotary_emb = TruncatedRotaryEmbedding(dim, a, b, rho)
55 |
56 | # Compute the truncated rotary embeddings for the specified sequence length
57 | rotary_embeddings = trunc_rotary_emb(seq_len, device)
58 |
59 | print(rotary_embeddings)
60 | ```
61 |
62 | In this example, the `TruncatedRotaryEmbedding` module is created with the specified `dim`, `a`, `b`, and `rho` parameters. The `forward` method is then called with the specified `seq_len` and `device` parameters to compute the truncated rotary embeddings for a sequence of length `seq_len`.
63 |
64 | ## Additional Information and Tips:
65 |
66 | - The `a`, `b`, and `rho` parameters control the truncation mechanism and may need to be tuned based on the specific application and data being used. In particular, the `a` parameter should be set to a value that effectively removes the high-frequency noise in the rotary embeddings, while the `b` parameter should be set to a value that retains the useful positional information in the rotary embeddings.
67 |
68 | - The `dim` parameter should be set to the same value as the embedding dimension used in the model.
69 |
70 | - The `device` parameter in the `forward` method should be set to the same device on which the model is being trained.
71 |
72 | ## Mathematical Formulation:
73 |
74 | The mathematical formulation of the truncated rotary embeddings can be expressed as follows:
75 |
76 | \[ \text{freqs} = t \cdot \text{inv\_freq} \]
77 |
78 | \[ \theta = \text{base}^{-2 \cdot i / \text{dim}}, \, i = 0, 2, \ldots, \text{dim}-2 \]
79 |
80 | \[ \theta^* =
81 | \begin{cases}
82 | 0, & \text{if } \theta < a \\
83 | \rho, & \text{if } a \leq \theta < b \\
84 | \theta, & \text{if } \theta \geq b
85 | \end{cases}
86 | \]
87 |
88 | \[ \text{result} = \text{freqs} \cdot \theta^* \]
89 |
90 | Where:
91 |
92 | - \( t \) is a tensor containing the position indices of the tokens in the sequence.
93 | - \( \text{inv\_freq} \) is a tensor containing the inverse frequencies computed based on the specified `dim` parameter.
94 | - \( \text{freqs} \) is a tensor containing the computed frequencies for each position in the sequence.
95 | - \( \theta \) is a tensor containing the computed theta values for each position in the sequence.
96 | - \( \theta^* \) is a tensor containing the truncated theta values for each position in the sequence.
97 | - \( \text{result} \) is the final tensor containing the truncated rotary embeddings for each position in the sequence.
98 |
99 | ## References and Resources:
100 |
101 | - Paper: [Link to the paper](https://arxiv.org/pdf/2308.10882.pdf)
102 |
103 | For further exploration and implementation details, refer to the paper linked above.
--------------------------------------------------------------------------------
/docs/zeta/nn/modules/lora.md:
--------------------------------------------------------------------------------
1 | # Lora
2 |
3 | The `Lora` class is a module of the Zeta library that provides a simple linear transformation of the input data. It is a part of the `torch.nn` module and extends the `nn.Module` class from PyTorch.
4 |
5 | ## Overview and Introduction
6 |
7 | The `Lora` class is designed to provide a scalable and efficient linear transformation operation. It is particularly useful in scenarios where the dimensionality of the input data is very high and computational efficiency is of paramount importance. The `Lora` class achieves this by breaking the weight matrix down into two lower-rank matrices `A` and `B`, which are learned during training, together with a fixed scale factor `alpha`. This results in a significant reduction in the number of parameters to be learned and, consequently, a more computationally efficient model.
8 |
9 | ## Key Concepts and Terminology
10 |
11 | - **Linear Transformation**: A linear transformation is a mathematical operation that transforms input data by multiplying it with a weight matrix. It is a fundamental operation in many machine learning models.
12 |
13 | - **Low Rank Approximation**: Low rank approximation is a technique used to approximate a matrix by another matrix of lower rank. This is often used to reduce the dimensionality of data and to make computations more efficient.
14 |
15 | - **Scale Factor**: A scale factor is a number by which a quantity is multiplied, changing the magnitude of the quantity.
16 |
17 | ## Class Definition
18 |
19 | The `Lora` class is defined as follows:
20 |
21 | ```python
22 | class Lora(nn.Module):
23 | def __init__(
24 | self,
25 | dim,
26 | dim_out,
27 | r=8,
28 | alpha=None
29 | ):
30 | super().__init__()
31 |         self.scale = alpha / r if alpha is not None else 1.0  # scale defaults to 1 when alpha is not provided
32 |
33 | self.A = nn.Parameter(torch.randn(dim, r))
34 | self.B = nn.Parameter(torch.randn(r, dim_out))
35 |
36 | @property
37 | def weight(self):
38 | return (self.A @ self.B) * self.scale
39 |
40 | def forward(self, x):
41 | return x @ self.weight
42 | ```
43 |
44 | ### Parameters
45 |
46 | - `dim` (`int`): The dimensionality of the input data. It is the number of features in the input data.
47 | - `dim_out` (`int`): The desired dimensionality of the output data. It is the number of features in the output data.
48 | - `r` (`int`, optional): The rank of the matrices `A` and `B`. It determines the size of the matrices `A` and `B`. Default is 8.
49 | - `alpha` (`float`, optional): Numerator of the scaling factor `alpha / r` applied to the low-rank update. If not provided, the scale defaults to 1.
50 |
51 | ### Methods
52 |
53 | #### `forward`
54 |
55 | The `forward` method is used to compute the forward pass of the `Lora` module.
56 |
57 | ##### Parameters
58 |
59 | - `x` (`Tensor`): The input data. It is a tensor of shape `(batch_size, dim)`.
60 |
61 | ##### Returns
62 |
63 | - `Tensor`: The transformed data. It is a tensor of shape `(batch_size, dim_out)`.
64 |
65 | ## Functionality and Usage
66 |
67 | The `Lora` class is used to perform a linear transformation of the input data. The transformation is defined by the weight matrix `W`, which is approximated by the product of two lower rank matrices `A` and `B` scaled by a factor. The `Lora` class learns the matrices `A` and `B` during the training process; the scale factor is a fixed hyperparameter.
68 |
69 | The forward pass of the `Lora` module computes the product of the input data `x` and the weight matrix `W`, which is approximated by `(A @ B) * scale`.
70 |
71 | ### Mathematical Formula
72 |
73 | The mathematical formula for the forward pass of the `Lora` module is:
74 |
75 | \[ y = xW \]
76 |
77 | Where:
78 | - \( y \) is the transformed data.
79 | - \( x \) is the input data.
80 | - \( W \) is the weight matrix, which is approximated by \( A B \times \text{scale} \).
81 |
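As a concrete illustration of the savings, a full weight matrix for `dim=128` and `dim_out=64` would require \( 128 \times 64 = 8192 \) parameters, while the default rank-8 factorization needs only \( 128 \times 8 + 8 \times 64 = 1536 \) parameters.
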
82 | ### Usage Examples
83 |
84 | Below are three examples of how to use the `Lora` class.
85 |
86 | #### Example 1: Basic Usage
87 |
88 | ```python
89 | import torch
90 | from zeta import Lora
91 |
92 | # Define the input data
93 | x = torch.randn(32, 128) # batch size of 32, and 128 features
94 |
95 | # Define the Lora module
96 | lora = Lora(dim=128, dim_out=64)
97 |
98 | # Compute the forward pass
99 | y = lora(x)
100 | ```
101 |
102 | #### Example 2: Specifying the Rank and Scale Factor
103 |
104 | ```python
105 | import torch
106 | from zeta import Lora
107 |
108 | # Define the input data
109 | x = torch.randn(32, 128) # batch size of 32, and 128 features
110 |
111 | # Define the Lora module with specified rank and scale factor
112 | lora = Lora(dim=128, dim_out=64, r=16, alpha=0.1)
113 |
114 | # Compute the forward pass
115 | y = lora(x)
116 | ```
117 |
118 | #### Example 3: Using the Lora Module in a Neural Network
119 |
120 | ```python
121 | import torch
122 | from torch import nn
123 | from zeta import Lora
124 |
125 | # Define a simple neural network with a Lora layer
126 | class Net(nn.Module):
127 | def __init__(self):
128 | super().__init__()
129 | self.lora = Lora(dim=128, dim_out=64)
130 | self.fc = nn.Linear(64, 10)
131 |
132 | def forward(self, x):
133 | x = self.lora(x)
134 | x = self.fc(x)
135 | return x
136 |
137 | # Define the input data
138 | x = torch.randn(32, 128) # batch size of 32, and 128 features
139 |
140 | # Define the model
141 | model = Net()
142 |
143 | # Compute the forward pass
144 | output = model(x)
145 | ```
146 |
147 | ## Additional Information and Tips
148 |
149 | - The `Lora` class is particularly useful in scenarios where the dimensionality of the input data is very high and computational efficiency is of paramount importance. However, it may not be suitable for all applications, as the approximation of the weight matrix may result in a loss of accuracy.
150 |
151 | - The rank `r` and the scale factor `alpha` are hyperparameters that need to be tuned for the specific application. A higher value of `r` will result in a more accurate approximation of the weight matrix, but will also increase the computational cost. Similarly, the scale factor `alpha` needs to be tuned to achieve the desired trade-off between accuracy and computational efficiency.
154 |
155 | ## References and Resources
156 |
157 | - [PyTorch nn.Module documentation](https://pytorch.org/docs/stable/generated/torch.nn.Module.html)
158 | - [Low Rank Matrix Factorization for Deep Neural Network Training with High-dimensional Output Targets](https://arxiv.org/abs/2005.08735)
159 |
160 | For further exploration and implementation details, you can refer to the above resources and the official PyTorch documentation.
--------------------------------------------------------------------------------
/docs/zeta/nn/modules/token_learner.md:
--------------------------------------------------------------------------------
1 | # Zeta Library Documentation
2 |
3 | ## Module Name: TokenLearner
4 |
5 | The `TokenLearner` is a PyTorch module designed for learning tokens from input data. It is a part of the Zeta library, a collection of modules and functions designed for efficient and flexible implementation of various deep learning tasks. The `TokenLearner` class is particularly useful for tasks such as image classification, object detection, and other applications where it is beneficial to extract tokens (representative features) from the input data.
6 |
7 | ## Introduction
8 |
9 | In various deep learning tasks, it is common to extract tokens (representative features) from the input data. These tokens are then used for downstream tasks like classification, detection, etc. The `TokenLearner` class is designed to efficiently extract tokens from the input data. It does this by utilizing a convolutional neural network (CNN) with grouped convolutions and a gating mechanism.
10 |
11 | ## Class Definition
12 |
13 | ```python
14 | class TokenLearner(nn.Module):
15 | def __init__(
16 | self,
17 | *,
18 | dim: int = None,
19 | ff_mult: int = 2,
20 | num_output_tokens: int = 8,
21 | num_layers: int = 2
22 | ):
23 | ...
24 | ```
25 |
26 | ### Parameters:
27 |
28 | - `dim` (int, optional): The dimension of the input data. Default is `None`.
29 | - `ff_mult` (int, optional): The factor by which the inner dimension of the network will be multiplied. Default is `2`.
30 | - `num_output_tokens` (int, optional): The number of tokens to be output by the network. Default is `8`.
31 | - `num_layers` (int, optional): The number of layers in the network. Default is `2`.
32 |
33 | ## Functionality and Usage
34 |
35 | The `TokenLearner` class is a PyTorch `nn.Module` that learns tokens from the input data. The input data is first packed and then processed through a series of grouped convolutions followed by a gating mechanism. The output is a set of tokens that are representative of the input data.
36 |
37 | The forward method of the `TokenLearner` class takes an input tensor `x` and performs the following operations:
38 |
39 | 1. The input tensor `x` is packed using the `pack_one` helper function.
40 | 2. The packed tensor is then rearranged and passed through a series of grouped convolutions and activation functions.
41 | 3. The output of the convolutions is then rearranged and multiplied with the input tensor.
42 | 4. The resulting tensor is then reduced to obtain the final tokens.
43 |
44 | ### Method:
45 |
46 | ```python
47 | def forward(self, x):
48 | ...
49 | ```
50 |
51 | ### Parameters:
52 |
53 | - `x` (Tensor): The input tensor of shape `(batch_size, channels, height, width)`.
54 |
55 | ### Returns:
56 |
57 | - `x` (Tensor): The output tokens of shape `(batch_size, channels, num_output_tokens)`.
58 |
59 | ## Usage Examples
60 |
61 | ### Example 1: Basic Usage
62 |
63 | ```python
64 | from zeta import TokenLearner
65 | import torch
66 |
67 | # Initialize the TokenLearner
68 | token_learner = TokenLearner(dim=64)
69 |
70 | # Generate some random input data
71 | x = torch.randn(1, 64, 32, 32)
72 |
73 | # Forward pass
74 | tokens = token_learner.forward(x)
75 |
76 | print(tokens.shape)
77 | ```
78 |
79 | In this example, a `TokenLearner` is initialized with an input dimension of 64. A random tensor of shape `(1, 64, 32, 32)` is then passed through the `TokenLearner` to obtain the tokens. The output will be a tensor of shape `(1, 64, 8)`.
80 |
81 | ### Example 2: Custom Parameters
82 |
83 | ```python
84 | from zeta import TokenLearner
85 | import torch
86 |
87 | # Initialize the TokenLearner with custom parameters
88 | token_learner = TokenLearner(dim=128, ff_mult=4, num_output_tokens=16)
89 |
90 | # Generate some random input data
91 | x = torch.randn(2, 128, 64, 64)
92 |
93 | # Forward pass
94 | tokens = token_learner.forward(x)
95 |
96 | print(tokens.shape)
97 | # Output: torch.Size([2, 128, 16])
98 | ```
99 |
100 | In this example, a `TokenLearner` is initialized with custom parameters. A random tensor of shape `(2, 128, 64, 64)` is then passed through the `TokenLearner` to obtain the tokens. The output will be a tensor of shape `(2, 128, 16)`.
101 |
102 | ### Example 3: Integration with Other PyTorch Modules
103 |
104 | ```python
105 | from zeta import TokenLearner
106 | import torch
107 | import torch.nn as nn
108 |
109 | # Initialize the TokenLearner
110 | token_learner = TokenLearner(dim=64)
111 |
112 | # Generate some random input data
113 | x = torch.randn(1, 64, 32, 32)
114 |
115 | # Define a simple model
116 | model = nn.Sequential(
117 | token_learner,
118 | nn.Flatten(),
119 | nn.Linear(64*8, 10)
120 | )
121 |
122 | # Forward pass
123 | output = model(x)
124 |
125 | print(output.shape)
126 | # Output: torch.Size([1, 10])
127 | ```
128 |
129 | In this example, the `TokenLearner` is integrated into a simple model consisting of the `TokenLearner`, a `Flatten` layer, and a `Linear` layer. A random tensor of shape `(1, 64, 32, 32)` is then passed through the model to obtain the final output. The output will be a tensor of shape `(1, 10)`.
130 |
131 | ## Mathematical Formulation
132 |
133 | The `TokenLearner` can be mathematically formulated as follows:
134 |
135 | Let `X` be the input tensor of shape `(B, C, H, W)`, where `B` is the batch size, `C` is the number of channels, `H` is the height, and `W` is the width. The `TokenLearner` first rearranges `X` to a tensor of shape `(B, G*C, H, W)`, where `G` is the number of output tokens. This is done by repeating `X` along the channel dimension `G` times.
136 |
137 | The rearranged tensor is then passed through a series of grouped convolutions and activation functions to obtain a tensor `A` of shape `(B, G, H, W)`. This tensor is then rearranged and multiplied with the input tensor `X` to obtain a tensor of shape `(B, C, G, H, W)`.
138 |
139 | The final tokens are obtained by reducing this tensor along the `H` and `W` dimensions to obtain a tensor of shape `(B, C, G)`.
140 |
141 | ## Additional Information and Tips
142 |
143 | - The `num_output_tokens` parameter controls the number of tokens that will be output by the `TokenLearner`. A larger number of output tokens will result in a more detailed representation of the input data, but will also increase the computational requirements.
144 |
145 | - The `ff_mult` parameter controls the inner dimension of the `TokenLearner`. A larger `ff_mult` will result in a larger capacity model, but will also increase the computational requirements.
146 |
147 | - The `TokenLearner` works best with input data that has a relatively small spatial dimension (e.g. 32x32 or 64x64). For larger input sizes, it may be beneficial to use a downsampling layer (e.g. `nn.MaxPool2d`) before passing the data through the `TokenLearner`.
148 |
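Following the last tip above, a minimal sketch of pairing a pooling layer with the `TokenLearner` (shapes are illustrative) is:

```python
from zeta import TokenLearner
import torch
import torch.nn as nn

# Downsample larger feature maps before learning tokens
model = nn.Sequential(
    nn.MaxPool2d(kernel_size=2),  # 128x128 -> 64x64 spatially
    TokenLearner(dim=64),
)

x = torch.randn(1, 64, 128, 128)
tokens = model(x)
print(tokens.shape)  # expected: torch.Size([1, 64, 8])
```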
149 |
--------------------------------------------------------------------------------
/docs/zeta/nn/utils/helpers.md:
--------------------------------------------------------------------------------
1 | ## Documentation
2 |
3 | ### Overview
4 |
5 | The provided module comprises utility functions and classes to streamline specific operations with Python data structures and PyTorch models. The main aspects of the module are:
6 |
7 | - Checking the existence of a value.
8 | - Implementing custom call behavior through classes.
9 | - Custom decorators for function calls.
10 | - Dictionary manipulation.
11 | - Initialization of PyTorch layer parameters.
12 |
13 | ### Functions and Classes
14 |
15 | 1. **exists(val: Any) -> bool**:
16 | Checks if the provided value is not `None`.
17 |
18 | 2. **default(val: Any, d: Any) -> Any**:
19 | Returns the value if it's not `None`; otherwise, it returns a default value.
20 |
21 | 3. **once(fn: Callable) -> Callable**:
22 | A decorator ensuring that the function is only called once.
23 |
24 | 4. **eval_decorator(fn: Callable) -> Callable**:
25 | A decorator for `torch.nn.Module` methods to switch the module to `eval` mode during the function call and revert to its original mode afterwards.
26 |
27 | 5. **cast_tuple(val: Any, depth: int) -> Tuple**:
28 | Casts a value to a tuple with a specific depth.
29 |
30 | 6. **maybe(fn: Callable) -> Callable**:
31 | A decorator that calls the function only if its first argument exists.
32 |
33 | 7. **always**:
34 | A class that always returns the specified value when called.
35 |
36 | 8. **not_equals** and **equals**:
37 | Classes that, when instantiated with a value, check if another value is (not) equal to the specified value.
38 |
39 | 9. **init_zero_(layer: nn.Module) -> None**:
40 | Initializes the weights and biases of a torch layer to zero.
41 |
42 | 10. **pick_and_pop(keys: List[str], d: Dict) -> Dict**:
43 | Extracts values from a dictionary based on provided keys.
44 |
45 | 11. **group_dict_by_key(cond: Callable, d: Dict) -> Tuple[Dict, Dict]**:
46 | Groups dictionary keys based on a given condition.
47 |
48 | 12. **string_begins_with(prefix: str, str: str) -> bool**:
49 | Checks if a string starts with a specific prefix.
50 |
51 | 13. **group_by_key_prefix(prefix: str, d: Dict) -> Tuple[Dict, Dict]**:
52 | Groups dictionary items by keys starting with a specific prefix.
53 |
54 | 14. **groupby_prefix_and_trim(prefix: str, d: Dict) -> Tuple[Dict, Dict]**:
55 | Similar to `group_by_key_prefix` but also removes the prefix from keys.
56 |
57 | ### Usage Examples
58 |
59 | 1. **Using the `once` decorator**:
60 |
61 | ```python
62 | from zeta import once
63 |
64 | @once
65 | def greet():
66 | print("Hello, World!")
67 |
68 | greet() # prints "Hello, World!"
69 | greet() # Does nothing on the second call
70 | ```
71 |
72 | 2. **Using the `eval_decorator` with PyTorch**:
73 |
74 | ```python
75 | import torch.nn as nn
76 | from zeta import eval_decorator
77 | import torch
78 | class SimpleModel(nn.Module):
79 | def __init__(self):
80 | super().__init__()
81 | self.layer = nn.Linear(10, 10)
82 |
83 | @eval_decorator
84 | def predict(self, x):
85 | return self.layer(x)
86 |
87 | model = SimpleModel()
88 | input_tensor = torch.randn(1, 10)
89 | output = model.predict(input_tensor) # Automatically switches to eval mode and back
90 | ```
91 |
92 | 3. **Dictionary Manipulation with Prefix Functions**:
93 |
94 | ```python
95 | from zeta import group_by_key_prefix
96 |
97 | sample_dict = {
98 | "user_name": "John",
99 | "user_age": 25,
100 | "order_id": 12345,
101 | "order_date": "2023-01-01"
102 | }
103 |
104 | user_data, order_data = group_by_key_prefix("user_", sample_dict)
105 | print(user_data) # {'user_name': 'John', 'user_age': 25}
106 | print(order_data) # {'order_id': 12345, 'order_date': '2023-01-01'}
107 | ```
108 |
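4. **Grouping and trimming prefixed keys** (an illustrative sketch based on the description of `groupby_prefix_and_trim` above; the exact return order is assumed):

```python
from zeta import groupby_prefix_and_trim

config = {"attn_dropout": 0.1, "attn_heads": 8, "ff_mult": 4}

attn_kwargs, remaining = groupby_prefix_and_trim("attn_", config)
print(attn_kwargs)  # {'dropout': 0.1, 'heads': 8}
print(remaining)    # {'ff_mult': 4}
```
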
109 | This module is a collection of general-purpose utility functions and classes, making many common operations more concise. It's beneficial when working with PyTorch models and various data manipulation tasks.
--------------------------------------------------------------------------------
/docs/zeta/tokenizers/language_tokenizer.md:
--------------------------------------------------------------------------------
1 | # Module Name: LanguageTokenizerGPTX
2 |
3 | The `LanguageTokenizerGPTX` is an embedding utility tailored for the "EleutherAI/gpt-neox-20b" transformer model. This class allows for seamless tokenization and decoding operations, abstracting away the underlying complexity of the chosen transformer's tokenizer.
4 |
5 | ## Introduction:
6 | Language tokenization is a crucial step in natural language processing tasks. This module provides an interface to tokenize and decode text using the GPT-Neox-20b transformer from the EleutherAI project. With the ability to manage end-of-string tokens, padding tokens, and a fixed model length, `LanguageTokenizerGPTX` serves as a convenient wrapper for the actual tokenizer from the transformers library.
7 |
8 | ## Class Definition:
9 |
10 | ```python
11 | class LanguageTokenizerGPTX:
12 | def __init__(self):
13 | ...
14 | def tokenize_texts(self, texts: str) -> torch.Tensor:
15 | ...
16 | def decode(self, texts: torch.Tensor) -> str:
17 | ...
18 | def __len__(self) -> int:
19 | ...
20 | ```
21 |
22 | ### Parameters:
23 | The class does not take any parameters upon instantiation. It uses predefined parameters internally to load the tokenizer.
24 |
25 | ### Methods:
26 |
27 | #### 1. `__init__(self) -> None`:
28 | Initializes the `LanguageTokenizerGPTX` object. This method loads the `AutoTokenizer` with predefined parameters.
29 |
30 | #### 2. `tokenize_texts(self, texts: str) -> torch.Tensor`:
31 | Tokenizes a given text or list of texts.
32 |
33 | - **texts** (str): The input text(s) to tokenize.
34 |
35 | **Returns**:
36 | - A torch Tensor of token IDs representing the input text(s).
37 |
38 | #### 3. `decode(self, texts: torch.Tensor) -> str`:
39 | Decodes a given tensor of token IDs back to text.
40 |
41 | - **texts** (torch.Tensor): The tensor of token IDs to decode.
42 |
43 | **Returns**:
44 | - A string representing the decoded text.
45 |
46 | #### 4. `__len__(self) -> int`:
47 | Provides the total number of tokens in the tokenizer's vocabulary.
48 |
49 | **Returns**:
50 | - An integer representing the total number of tokens.
51 |
52 | ## Usage Examples:
53 |
54 | ```python
55 | from zeta import LanguageTokenizerGPTX
56 | import torch
57 |
58 | # Initialize the tokenizer
59 | tokenizer = LanguageTokenizerGPTX()
60 |
61 | # Example 1: Tokenize a single text
62 | text = "Hello, world!"
63 | tokenized_text = tokenizer.tokenize_texts(text)
64 | print(tokenized_text)
65 |
66 | # Example 2: Decode a tokenized text
67 | decoded_text = tokenizer.decode(tokenized_text)
68 | print(decoded_text)
69 |
70 | # Example 3: Get the number of tokens in the tokenizer's vocabulary
71 | num_tokens = len(tokenizer)
72 | print(f"The tokenizer has {num_tokens} tokens.")
73 | ```
74 |
75 | ## Mathematical Formulation:
76 |
77 | Given a text \( t \) and a vocabulary \( V \) from the GPT-Neox-20b model, tokenization maps \( t \) to a sequence of token IDs \( T \) where each token ID \( t_i \) corresponds to a token in \( V \). Decoding reverses this process.
78 |
79 | \[ t \xrightarrow{\text{tokenize}} T \]
80 | \[ T \xrightarrow{\text{decode}} t \]
81 |
82 | ## Additional Information:
83 |
84 | The GPT-Neox-20b model is part of the EleutherAI project. It's a variant of the GPT architecture with tweaks in terms of model size and training. Utilizing such models require an understanding of tokenization and decoding, which this module aims to simplify.
85 |
86 | ## References:
87 |
88 | - [Transformers Library by Hugging Face](https://huggingface.co/transformers/)
89 | - [EleutherAI GPT-Neox](https://github.com/EleutherAI/gpt-neox)
90 |
91 | Note: Ensure you have the necessary packages and dependencies installed, particularly the transformers library from Hugging Face.
--------------------------------------------------------------------------------
/docs/zeta/tokenizers/multi_modal_tokenizer.md:
--------------------------------------------------------------------------------
1 | # **Documentation for Zeta Library's MultiModalTokenizer Class**
2 |
3 | ---
4 |
5 | ## **Introduction and Overview**
6 |
7 | The `MultiModalTokenizer` class is part of the Zeta Library, designed to provide tokenization capabilities for both text and image data. This enables more seamless integration and utilization of multimodal (text and image) data, especially when used with models that can handle such information simultaneously, like the CLIP model.
8 |
9 | **Key Features**:
10 |
11 | 1. **Multimodal Tokenization**: Combines text and image tokenization within one unified class.
12 | 2. **Integration with Hugging Face Transformers**: Utilizes the `CLIPProcessor` for image tokenization and `AutoTokenizer` for text tokenization.
13 | 3. **Special Tokens for Image Segmentation**: Uses the special tokens `<image>` and `</image>` to denote image token boundaries within text.
14 | 4. **Error Handling**: Implements comprehensive error handling and logging to ensure robustness.
15 |
16 | ---
17 |
18 | ## **Class Definition**
19 |
20 | ### **MultiModalTokenizer**
21 |
22 | ```python
23 | class MultiModalTokenizer:
24 | """
25 | A tokenizer class for the kosmos model
26 |
27 | Attributes:
28 | processor(CLIPProcessor): The processor to tokenize images.
29 | tokenizer(AutoTokenizer): The tokenizer to tokenize text.
30 |         im_idx(int): The index of the "<image>" token.
31 |         im_end_idx(int): The index of the "</image>" token.
32 | """
33 | ```
34 |
35 | #### **Parameters**:
36 |
37 | - **max_length (int, optional)**: Maximum length of the tokenized sequence. Defaults to 8192.
38 |
39 | #### **Attributes**:
40 |
41 | - **processor (CLIPProcessor)**: The processor used to tokenize images.
42 | - **tokenizer (AutoTokenizer)**: The tokenizer used to tokenize text.
43 | - **im_idx (int)**: Index of the `<image>` token.
44 | - **im_end_idx (int)**: Index of the `</image>` token.
45 |
46 | ---
47 |
48 | ## **Methods**
49 |
50 | ### **1. tokenize_texts**
51 |
52 | ```python
53 | def tokenize_texts(self, texts: str) -> Tuple[torch.Tensor, torch.Tensor]:
54 | """
55 | Tokenize given texts.
56 |
57 | Args:
58 | texts (str): The text to be tokenized.
59 |
60 | Returns:
61 | A tuple containing the tokenized texts and only the text tokens.
62 | """
63 | ```
64 |
65 | ### **2. tokenize_images**
66 |
67 | ```python
68 | def tokenize_images(self, images) -> torch.Tensor:
69 | """
70 | Tokenizes given images.
71 |
72 | Args:
73 | images: The images to be tokenized.
74 |
75 | Returns:
76 | The tokenized images.
77 | """
78 | ```
79 |
80 | ### **3. tokenize**
81 |
82 | ```python
83 | def tokenize(self, sample) -> Dict[str, torch.Tensor]:
84 | """
85 | Tokenizes given sample.
86 |
87 | Args:
88 | sample: The sample to be tokenized.
89 |
90 | Returns:
91 | A dictionary containing the tokenized text tokens, images, labels, and attention mask.
92 | """
93 | ```
94 |
95 | ---
96 |
97 | ## **Usage Examples**
98 |
99 | ### **Example 1: Tokenizing Texts**
100 |
101 | ```python
102 | from zeta import MultiModalTokenizer
103 | import torch
104 |
105 | tokenizer = MultiModalTokenizer()
106 | texts = ["Hello World", "Zeta Library is great!"]
107 | tokenized_texts, only_texts = tokenizer.tokenize_texts(texts)
108 | print(tokenized_texts)
109 | print(only_texts)
110 | ```
111 |
112 | ### **Example 2: Tokenizing Images**
113 |
114 | ```python
115 | from zeta import MultiModalTokenizer
116 | import torch
117 |
118 | tokenizer = MultiModalTokenizer()
119 | images = torch.randn(2, 3, 224, 224) # Assuming 2 random images of shape 3x224x224
120 | tokenized_images = tokenizer.tokenize_images(images)
121 | print(tokenized_images)
122 | ```
123 |
124 | ### **Example 3: Tokenizing Multimodal Data**
125 |
126 | ```python
127 | from zeta import MultiModalTokenizer
128 | import torch
129 |
130 | tokenizer = MultiModalTokenizer()
131 | sample = {
132 | "target_text": ["Hello World", "Zeta Library is great!"],
133 | "image": torch.randn(2, 3, 224, 224)
134 | }
135 | tokenized_data = tokenizer.tokenize(sample)
136 | print(tokenized_data)
137 | ```
138 |
139 | ---
140 |
141 | ## **Mathematical Overview**
142 |
143 | Given a text sequence \( T \) of length \( n \) and an image \( I \) represented by a tensor of shape \( C \times H \times W \), where \( C \) is the number of channels, \( H \) is the height, and \( W \) is the width:
144 |
145 | 1. The tokenized text, \( T' \), is represented as:
146 | \[ T' = [\text{<s>}, \text{<image>}, \text{</image>}, T_{1}, T_{2}, ..., T_{n}, \text{</s>}] \]
147 |
148 | 2. The tokenized image, \( I' \), is processed using the CLIP processor to obtain a tensor representation.
149 |
150 | 3. When both text and image data are tokenized using the `tokenize` method, the output contains both \( T' \) and \( I' \) with their respective attention masks.
151 |
152 | ---
153 |
154 | ## **Additional Tips**
155 |
156 | - Ensure you have the required model weights and configurations for the specified pretrained models ("laion/CLIP-ViT-L-14-laion2B-s32B-b82K" and "EleutherAI/gpt-neox-20b") downloaded or accessible from the Hugging Face Model Hub.
157 |
158 | - Handle potential tokenization errors gracefully using try-except blocks, as demonstrated in the provided methods.
159 |
160 | ---
161 |
162 | ## **References and Resources**
163 |
164 | 1. CLIP: Connecting Text and Images - OpenAI: [Link](https://openai.com/blog/clip/)
165 | 2. Hugging Face's Transformers library: [Link](https://huggingface.co/transformers/)
166 | 3. Documentation on Special Tokens in Transformers: [Link](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.add_special_tokens)
167 |
168 | ---
--------------------------------------------------------------------------------
/docs/zeta/tokenizers/sentencepiece.md:
--------------------------------------------------------------------------------
1 | # SentencePieceTokenizer
2 |
3 | `SentencePieceTokenizer` is a class for tokenizing and detokenizing text using a pre-trained SentencePiece model. SentencePiece is an unsupervised text tokenizer and detokenizer intended mainly for neural-network-based text generation tasks, where the vocabulary size is fixed before model training. This class is part of the zeta library, a collection of utility functions and classes for natural language processing tasks.
4 |
5 | ## Introduction
6 |
7 | Tokenization is a crucial step in many natural language processing tasks. It involves splitting a piece of text into smaller units, called tokens. These tokens can be as small as characters or as large as words. The `SentencePieceTokenizer` class provides an efficient and easy-to-use way to tokenize and detokenize text using a SentencePiece model.
8 |
9 | The SentencePiece model is trained to find the best tokenization by dynamically adjusting the size and boundaries of tokens. SentencePiece implements subword units (e.g., byte-pair encoding (BPE) and the unigram language model) with the extension of direct training from raw sentences, allowing a purely end-to-end system that does not depend on language-specific pre- or post-processing.
10 |
11 | ## Class Definition
12 |
13 | ```python
14 | class SentencePieceTokenizer:
15 | def __init__(self, model_path: str):
16 | ...
17 | ```
18 |
19 | ### Parameters:
20 |
21 | - `model_path (str)`: The path to the pre-trained SentencePiece model, a file with a `.model` extension.
22 |
23 | ### Attributes:
24 |
25 | - `n_words (int)`: The vocabulary size of the SentencePiece model.
26 | - `bos_id (int)`: The token ID for the beginning of sentence token.
27 | - `eos_id (int)`: The token ID for the end of sentence token.
28 | - `pad_id (int)`: The token ID for the padding token.
29 | - `prefix_id (int, optional)`: The token ID for the prefix token.
30 | - `middle_id (int, optional)`: The token ID for the middle token.
31 | - `suffix_id (int, optional)`: The token ID for the suffix token.
32 | - `eot_id (int, optional)`: The token ID for the end of text token.
33 |
34 | ## Methods
35 |
36 | ### `encode`
37 |
38 | ```python
39 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
40 | ...
41 | ```
42 |
43 | Encodes a string into a list of integer token IDs.
44 |
45 | #### Parameters:
46 |
47 | - `s (str)`: The string to be encoded.
48 | - `bos (bool)`: Whether to add the beginning of sentence token at the start.
49 | - `eos (bool)`: Whether to add the end of sentence token at the end.
50 |
51 | #### Returns:
52 |
53 | - `List[int]`: A list of integer token IDs.
54 |
55 | ### `decode`
56 |
57 | ```python
58 | def decode(self, t: List[int]) -> str:
59 | ...
60 | ```
61 |
62 | Decodes a list of integer token IDs into a string.
63 |
64 | #### Parameters:
65 |
66 | - `t (List[int])`: A list of integer token IDs to be decoded.
67 |
68 | #### Returns:
69 |
70 | - `str`: The decoded string.
71 |
72 | ### `encode_infilling`
73 |
74 | ```python
75 | def encode_infilling(self, s: str) -> List[int]:
76 | ...
77 | ```
78 |
79 | Encodes a string without an implicit leading space.
80 |
81 | #### Parameters:
82 |
83 | - `s (str)`: The string to be encoded.
84 |
85 | #### Returns:
86 |
87 | - `List[int]`: A list of integer token IDs.
88 |
89 | ### `decode_infilling`
90 |
91 | ```python
92 | def decode_infilling(self, t: List[int]) -> str:
93 | ...
94 | ```
95 |
96 | Decodes a list of integer token IDs into a string without an implicit leading space.
97 |
98 | #### Parameters:
99 |
100 | - `t (List[int])`: A list of integer token IDs to be decoded.
101 |
102 | #### Returns:
103 |
104 | - `str`: The decoded string.
105 |
106 | ## Usage Examples
107 |
108 | ### Example 1:
109 |
110 | ```python
111 | from zeta import SentencePieceTokenizer
112 |
113 | tokenizer = SentencePieceTokenizer(model_path='path/to/your/model.model')
114 | text = "Hello, world!"
115 | tokens = tokenizer.encode(text, bos=True, eos=True)
116 | print(tokens)
117 | # [2, 284, 16, 250, 13, 849, 4, 3]
118 |
119 | decoded_text = tokenizer.decode(tokens)
120 | print(decoded_text)
121 | # "Hello, world!"
122 | ```
123 |
124 | ### Example 2:
125 |
126 | ```python
127 | from zeta import SentencePieceTokenizer
128 |
129 | tokenizer = SentencePieceTokenizer(model_path='path/to/your/model.model')
130 | text = "Hello, world!"
131 | tokens = tokenizer.encode_infilling(text)
132 | print(tokens)
133 | # [284, 16, 250, 13, 849, 4]
134 |
135 | decoded_text = tokenizer.decode_infilling(tokens)
136 | print(decoded_text)
137 | # "Hello, world!"
138 | ```
139 |
140 | ### Example 3:
141 |
142 | ```python
143 | from zeta import SentencePieceTokenizer
144 |
145 | tokenizer = SentencePieceTokenizer(model_path='path/to/your/model.model')
146 | tokens = [2, 284, 16, 250, 13, 849, 4, 3]
147 | decoded_text = tokenizer.decode(tokens)
148 | print(decoded_text)
149 | # "Hello, world!"
150 | ```
151 |
152 | ## Additional Information
153 |
154 | - Make sure that the model file specified in `model_path` exists.
155 | - The special infilling tokens (prefix, middle, suffix, and end-of-text) are optional and may not be present in all SentencePiece models; see the optional `prefix_id`, `middle_id`, `suffix_id`, and `eot_id` attributes above.
156 |
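157 | If you want to check what a given model file provides, the `sentencepiece` package can be queried directly. This is a minimal sketch assuming a trained model exists at `path/to/your/model.model`; the printed IDs correspond to the attributes listed above, with `-1` indicating that the model does not define that token.
158 | 
159 | ```python
160 | import sentencepiece as spm
161 | 
162 | sp = spm.SentencePieceProcessor(model_file="path/to/your/model.model")
163 | print(sp.vocab_size())                        # corresponds to n_words
164 | print(sp.bos_id(), sp.eos_id(), sp.pad_id())  # -1 if a token is undefined
165 | print(sp.encode("Hello, world!", out_type=int))
166 | ```
167 | 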
157 | ## References and Resources
158 |
159 | - [SentencePiece GitHub Repository](https://github.com/google/sentencepiece)
160 | - [SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Generation](https://arxiv.org/abs/1808.06226)
161 |
162 | ## Mathematical Formulation
163 |
164 | The SentencePiece model uses the following mathematical formula for tokenization:
165 |
166 | \[P(w) = \prod_{i=1}^{n} P(w_i | w_1, ..., w_{i-1})\]
167 |
168 | Where:
169 | - \(P(w)\) is the probability of the word \(w\).
170 | - \(n\) is the number of subwords in the word \(w\).
171 | - \(w_i\) is the \(i\)-th subword of \(w\).
172 |
173 | The model is trained to maximize the likelihood of the training data, and the subwords are chosen to minimize the perplexity of the training data.
--------------------------------------------------------------------------------
/docs/zeta/training/nebula.md:
--------------------------------------------------------------------------------
1 | # Nebula
2 |
3 | The `Nebula` class is a custom loss function that dynamically determines the most suitable loss function for a given dataset based on characteristics of the data, such as sparsity, correlation, the range of values, and user input. It is part of the `zeta` library and builds on a `LossFunction` base class, delegating the actual loss computation to standard PyTorch loss functions.
4 |
5 | ## Introduction
6 |
7 | The purpose of the `Nebula` class is to help determine and cache the most suitable loss function for a given dataset without requiring the user to manually select one. This can be particularly useful in scenarios where the user is unsure of the most appropriate loss function to use or in automated systems where the type of problem (classification or regression) is not known a priori.
8 |
9 | The `Nebula` class considers various characteristics of the data, such as whether the target values are integers, the sparsity of the target values, the correlation between predictions and target values, and any user or domain knowledge provided, to determine whether the problem is a classification or regression problem and subsequently select an appropriate loss function.
10 |
11 | ## Class Definition
12 |
13 | ```python
14 | class Nebula(LossFunction):
15 | def __init__(self, domain_knowledge=None, user_input=None):
16 | ...
17 | ```
18 |
19 | ### Parameters
20 |
21 | - `domain_knowledge` (str, optional): Domain knowledge about the problem. It can be either "classification" or "regression". Default is `None`.
22 | - `user_input` (str, optional): User input about the problem type. It can be either "classification" or "regression". Default is `None`.
23 |
24 | ### Attributes
25 |
26 | - `loss_function`: The determined loss function.
27 | - `domain_knowledge`: Domain knowledge provided during initialization.
28 | - `user_input`: User input provided during initialization.
29 | - `loss_function_cache`: A cache for storing the determined loss function for a dataset.
30 | - `unique_values_cache`: A cache for storing the unique values in the target variable `y_true`.
31 | - `class_balance_cache`: A cache for storing the class balance in the target variable `y_true`.
32 | - `logger`: A logger for logging information during the determination of the loss function.
33 |
34 | ## Functionality and Usage
35 |
36 | The `Nebula` class is used to dynamically determine the most suitable loss function for a given dataset and cache the determined loss function for future use. The class analyzes the unique values, class balance, sparsity, and correlation of the target variable `y_true` and the predicted variable `y_pred` to determine whether the problem is a classification or regression problem and select an appropriate loss function.
37 |
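38 | To make the decision process concrete, the snippet below sketches a simplified version of this kind of heuristic. It illustrates the general idea only, not Nebula's actual selection logic, and the threshold is arbitrary.
39 | 
40 | ```python
41 | # Simplified illustration: choose a loss function from target characteristics.
42 | import torch
43 | import torch.nn.functional as F
44 | 
45 | def pick_loss(y_true: torch.Tensor):
46 |     yt = y_true.float()
47 |     # integer-valued targets with few distinct values look like class labels
48 |     looks_discrete = bool(torch.all(yt == yt.round())) and yt.unique().numel() <= 20
49 |     return F.cross_entropy if looks_discrete else F.mse_loss
50 | 
51 | y_pred = torch.randn(10, 5)
52 | y_true = torch.randint(0, 5, (10,))
53 | loss = pick_loss(y_true)(y_pred, y_true)
54 | print(loss)
55 | ```
56 | 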
38 | ### Method: `determine_loss_function`
39 |
40 | ```python
41 | def determine_loss_function(self, y_pred, y_true):
42 | ...
43 | ```
44 |
45 | This method determines the most suitable loss function based on the characteristics of `y_pred` and `y_true`.
46 |
47 | #### Parameters
48 |
49 | - `y_pred` (Tensor): The predicted values.
50 | - `y_true` (Tensor): The ground truth values.
51 |
52 | ### Method: `__call__`
53 |
54 | ```python
55 | def __call__(self, y_pred, y_true):
56 | ...
57 | ```
58 |
59 | This method computes the loss using the determined loss function.
60 |
61 | #### Parameters
62 |
63 | - `y_pred` (Tensor): The predicted values.
64 | - `y_true` (Tensor): The ground truth values.
65 |
66 | #### Returns
67 |
68 | - `Tensor`: The computed loss.
69 |
70 | ### Usage Examples
71 |
72 | #### Example 1: Basic Usage
73 |
74 | ```python
75 | from zeta import Nebula
76 | import torch
77 |
78 | # Initialize Nebula
79 | nebula = Nebula()
80 |
81 | # Generate some example data
82 | y_pred = torch.randn(10, 5)
83 | y_true = torch.randint(0, 5, (10,))
84 |
85 | # Compute the loss
86 | loss = nebula(y_pred, y_true)
87 |
88 | print(loss)
89 | ```
90 |
91 | #### Example 2: Providing Domain Knowledge
92 |
93 | ```python
94 | from zeta import Nebula
95 | import torch
96 |
97 | # Initialize Nebula with domain knowledge
98 | nebula = Nebula(domain_knowledge="classification")
99 |
100 | # Generate some example data
101 | y_pred = torch.randn(10, 5)
102 | y_true = torch.randint(0, 5, (10,))
103 |
104 | # Compute the loss
105 | loss = nebula(y_pred, y_true)
106 |
107 | print(loss)
108 | ```
109 |
110 | #### Example 3: Providing User Input
111 |
112 | ```python
113 | from zeta import Nebula
114 | import torch
115 |
116 | # Initialize Nebula with user input
117 | nebula = Nebula(user_input="regression")
118 |
119 | # Generate some example data
120 | y_pred = torch.randn(10, 1)
121 | y_true = torch.randn(10, 1)
122 |
123 | # Compute the loss
124 | loss = nebula(y_pred, y_true)
125 |
126 | print(loss)
127 | ```
128 |
129 | ## Mathematical Formula
130 |
131 | The `Nebula` class does not have a specific mathematical formula as it dynamically determines the most suitable loss function based on the characteristics of the data. However, the determined loss function will have its own mathematical formula, which can be found in the PyTorch documentation or the `zeta` library documentation.
132 |
133 | ## Additional Information and Tips
134 |
135 | - The `Nebula` class caches the determined loss function, unique values, and class balance for a given dataset to avoid recomputing them in the future.
136 | - If both `domain_knowledge` and `user_input` are provided, `domain_knowledge` will take precedence over `user_input`.
137 | - The `Nebula` class uses the `logging` module to log information during the determination of the loss function. You can customize the logging settings by modifying the `logger` attribute.
138 |
139 |
--------------------------------------------------------------------------------
/docs/zeta/training/optimizers/decoupled_lion.md:
--------------------------------------------------------------------------------
1 | # DecoupledLionW Optimizer
2 |
3 | ## Overview and Introduction
4 |
5 | `DecoupledLionW` is a PyTorch optimizer designed to improve training performance and convergence for deep learning models. It extends the Lion optimizer, combining a momentum-based, sign-driven update rule with decoupled weight decay.
6 | 
7 | The weight decay is applied separately from the gradient-based update, as in AdamW. This decoupling helps prevent overfitting, improves generalization, and supports faster, smoother convergence.
8 |
9 | ### Key Concepts:
10 |
11 | - **Weight Decay:** Reduces the magnitude of the model's weights, preventing overfitting and improving generalization.
12 | - **Momentum Update:** An interpolation between the current gradient and the previous momentum state, allowing for faster convergence and smoother optimization.
13 | - **Momentum Decay:** Gradually reduces the momentum term over time, preventing it from becoming too large and destabilizing the optimization process.
14 |
15 | ## Class Definition
16 |
17 | ```python
18 | class DecoupledLionW(Optimizer):
19 | def __init__(
20 | self,
21 | params,
22 | lr: float = 1e-4,
23 | betas: Tuple[float, float] = (0.9, 0.99),
24 | weight_decay: float = 0.0,
25 | ):
26 | ```
27 |
28 | ### Parameters
29 |
30 | - `params` (iterable): Iterable of parameters to optimize or dictionaries defining parameter groups.
31 | - `lr` (float, optional): Learning rate. Default: 1e-4.
32 | - `betas` (Tuple[float, float], optional): Coefficients controlling the momentum interpolation and the momentum decay, respectively. Default: (0.9, 0.99).
33 | - `weight_decay` (float, optional): Weight decay (L2 penalty). Default: 0.
34 |
35 | ### Attributes
36 |
37 | - `metric_functions`: A dictionary of lambda functions to compute various metrics like L2 norm of moments, parameters, updates, and gradients, as well as cosine similarity between updates and gradients.
38 |
39 | ## Functionality and Usage
40 |
41 | ### `lionw` Method
42 |
43 | This static method is responsible for applying the weight decay, momentum update, and momentum decay.
44 |
45 | ```python
46 | @staticmethod
47 | def lionw(p, grad, exp_avg, lr, initial_lr, wd, beta1, beta2) -> None:
48 | ```
49 |
50 | #### Parameters
51 |
52 | - `p` (Tensor): Parameter tensor.
53 | - `grad` (Tensor): Gradient tensor.
54 | - `exp_avg` (Tensor): Exponential moving average of gradient values.
55 | - `lr` (float): Learning rate.
56 | - `initial_lr` (float): Initial learning rate.
57 | - `wd` (float): Weight decay.
58 | - `beta1` (float): Coefficient for interpolating between the momentum and the current gradient when forming the update direction.
59 | - `beta2` (float): Coefficient for the momentum decay applied after the update.
60 |
61 | ### `step` Method
62 |
63 | Performs a single optimization step.
64 |
65 | ```python
66 | @torch.no_grad()
67 | def step(self, closure: Optional[Callable] = None):
68 | ```
69 |
70 | #### Parameters
71 |
72 | - `closure` (callable, optional): A closure that reevaluates the model and returns the loss.
73 |
74 | #### Returns
75 |
76 | - `loss` (float, optional): The loss value if `closure` is provided. None otherwise.
77 |
78 | ### `pre_reduce_metrics` Method
79 |
80 | This method preprocesses the metrics before reduction across nodes.
81 |
82 | ```python
83 | def pre_reduce_metrics(self, optimizer_metrics):
84 | ```
85 |
86 | #### Parameters
87 |
88 | - `optimizer_metrics` (dict): A dictionary containing the optimizer metrics.
89 |
90 | #### Returns
91 |
92 | - `optimizer_metrics` (dict): The pre-processed optimizer metrics.
93 |
94 | ### `report_per_parameter_metrics` Method
95 |
96 | This method reports the per-parameter metrics.
97 |
98 | ```python
99 | def report_per_parameter_metrics(self, param: torch.Tensor, name: str, optimizer_metrics: dict):
100 | ```
101 |
102 | #### Parameters
103 |
104 | - `param` (Tensor): Parameter tensor.
105 | - `name` (str): Name of the parameter.
106 | - `optimizer_metrics` (dict): A dictionary containing the optimizer metrics.
107 |
108 | #### Returns
109 |
110 | - `optimizer_metrics` (dict): The optimizer metrics with the reported per-parameter metrics.
111 |
112 | ## Usage Examples
113 |
114 | ```python
115 | from zeta import DecoupledLionW
116 | import torch
117 |
118 | # Define model parameters
119 | params = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
120 |
121 | # Define optimizer
122 | optimizer = DecoupledLionW([params], lr=0.1, betas=(0.9, 0.999), weight_decay=0.01)  # params passed as an iterable
123 |
124 | # Define loss function
125 | loss_fn = torch.nn.MSELoss()
126 |
127 | # Forward pass
128 | output = params * 2.0  # placeholder computation standing in for a model
129 | target = torch.tensor([0.0, 1.0, 2.0])
130 | loss = loss_fn(output, target)
131 |
132 | # Backward pass
133 | loss.backward()
134 |
135 | # Optimization step
136 | optimizer.step()
137 | ```
138 |
139 | ## Mathematical Formula
140 |
141 | The update rule of the optimizer can be represented by the following formula:
142 |
143 | \[ p \leftarrow p - \alpha \cdot \mathrm{sign}(\beta_1 \cdot m + (1-\beta_1) \cdot g) - \eta \cdot wd \cdot p \]
144 |
145 | Where:
146 |
147 | - \( p \) is the parameter.
148 | - \( \alpha \) is the learning rate.
149 | - \( \beta_1 \) is the exponential decay rate for the first moment estimates.
150 | - \( m \) is the momentum (exponential moving average of gradient values).
151 | - \( g \) is the gradient.
152 | - \( \eta \) is the decay factor.
153 | - \( wd \) is the weight decay.
154 |
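155 | Under these definitions, a single parameter update can be sketched in plain PyTorch as follows. This is an illustrative reading of the formula above (with \( \eta \) read as the ratio of the current to the initial learning rate, an assumption, since the document calls it only "the decay factor"); it is not the library's `lionw` implementation.
156 | 
157 | ```python
158 | # Sketch of one decoupled Lion update; argument names mirror the lionw signature.
159 | import torch
160 | 
161 | def lionw_step(p, grad, exp_avg, lr, initial_lr, wd, beta1, beta2):
162 |     p.data.mul_(1 - (lr / initial_lr) * wd)                         # decoupled weight decay
163 |     update = exp_avg.mul(beta1).add(grad, alpha=1 - beta1).sign_()  # sign of interpolated momentum
164 |     p.data.add_(update, alpha=-lr)                                  # parameter step
165 |     exp_avg.mul_(beta2).add_(grad, alpha=1 - beta2)                 # momentum decay
166 | 
167 | p, g, m = torch.randn(3), torch.randn(3), torch.zeros(3)
168 | lionw_step(p, g, m, lr=1e-4, initial_lr=1e-4, wd=0.01, beta1=0.9, beta2=0.99)
169 | print(p)
170 | ```
171 | 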
155 | ## Additional Information and Tips
156 |
157 | - A high value of `weight_decay` can lead to a large reduction in the model's weights on every step. Be sure to choose a value appropriate for your specific use case.
158 | - The optimizer supports both single-node and multi-node distributed training, enabling efficient training on parallel computing environments.
159 |
--------------------------------------------------------------------------------
/docs/zeta/training/optimizers/sophia.md:
--------------------------------------------------------------------------------
1 | # SophiaG Optimizer for Zeta Library
2 |
3 | ## Overview
4 |
5 | The SophiaG optimizer is designed to adaptively change learning rates during training, offering a combination of momentum-based acceleration and second-order Hessian-based adaptive learning rates. This optimizer is particularly useful for training deep neural networks and optimizing complex, non-convex loss functions. Key features include:
6 |
7 | 1. **Momentum**: Utilizes exponentially moving averages of gradients.
8 | 2. **Adaptive Learning Rate**: Adjusts the learning rate based on the second-order Hessian information.
9 | 3. **Regularization**: Applies weight decay to avoid overfitting.
10 | 4. **Optional Settings**: Allows for maximizing the loss function, customizable settings for capturable and dynamic parameters.
11 |
12 | ## Class Definition
13 |
14 | ```python
15 | class SophiaG(Optimizer):
16 | def __init__(self, params, lr=1e-4, betas=(0.965, 0.99), rho=0.04,
17 | weight_decay=1e-1, *, maximize: bool = False,
18 | capturable: bool = False, dynamic: bool = False):
19 | ```
20 |
21 | ### Parameters:
22 |
23 | - `params` (iterable): Iterable of parameters to optimize.
24 | - `lr` (float, default=1e-4): Learning rate.
25 | - `betas` (Tuple[float, float], default=(0.965, 0.99)): Coefficients used for computing running averages of gradient and Hessian.
26 | - `rho` (float, default=0.04): Damping factor for Hessian-based updates.
27 | - `weight_decay` (float, default=1e-1): Weight decay factor.
28 | - `maximize` (bool, default=False): Whether to maximize the loss function.
29 | - `capturable` (bool, default=False): Enable/Disable special capturing features.
30 | - `dynamic` (bool, default=False): Enable/Disable dynamic adjustments of the optimizer.
31 |
32 | ## Usage and Functionality
33 |
34 | ### 1. Initialization
35 |
36 | Upon initialization, the optimizer validates its hyperparameters and stores them as the defaults for its parameter groups.
37 |
38 | ```python
39 | from zeta import SophiaG
40 |
41 | optimizer = SophiaG(model.parameters(), lr=0.01, betas=(0.9, 0.999), weight_decay=1e-4)
42 | ```
43 |
44 | ### 2. Step Forward
45 |
46 | The `.step()` method updates the model parameters. It is decorated with `@torch.no_grad()` so that no computation graph is built during the parameter update.
47 |
48 | ```python
49 | loss = criterion(output, target)
50 | loss.backward()
51 | optimizer.step()
52 | ```
53 |
54 | ### 3. Update Hessian and Exponential Average
55 |
56 | The optimizer has internal methods to update the Hessian and Exponential Moving Average (EMA) of the gradients, controlled by `betas`.
57 |
58 | ### 4. SophiaG Function
59 |
60 | The core SophiaG function updates the parameters based on the gradient (`grad`), moving average (`exp_avg`), and Hessian (`hessian`). It uses the following update formula:
61 |
62 | \[ \text{param} = \text{param} - \text{lr} \times \left( \text{beta}_1 \times \text{exp_avg} + \frac{(1-\text{beta}_1) \times \text{grad}}{( \text{beta}_2 \times \text{hessian} + (1-\text{beta}_2) )^{\rho}} \right) \]
63 |
64 | ## Usage Examples
65 |
66 | ### 1. Basic Usage:
67 |
68 | ```python
69 | from zeta import SophiaG
70 | import torch
71 | import torch.nn as nn
72 |
73 | model = nn.Linear(10, 1)
74 | optimizer = SophiaG(model.parameters(), lr=0.01)
75 | ```
76 |
77 | ### 2. Customizing Betas and Learning Rate:
78 |
79 | ```python
80 | from zeta import SophiaG
81 | import torch
82 |
83 | optimizer = SophiaG(model.parameters(), lr=0.001, betas=(0.9, 0.999))
84 | ```
85 |
86 | ### 3. Using with Weight Decay:
87 |
88 | ```python
89 | from zeta import SophiaG
90 |
91 | optimizer = SophiaG(model.parameters(), lr=0.01, weight_decay=1e-4)
92 | ```
93 |
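94 | ### 4. Full Training Step:
95 | 
96 | The example below runs one complete optimization step on a toy regression problem. The model, data, and loss are placeholders for illustration; only the `SophiaG` usage follows the API described above.
97 | 
98 | ```python
99 | from zeta import SophiaG
100 | import torch
101 | import torch.nn as nn
102 | 
103 | model = nn.Linear(10, 1)
104 | optimizer = SophiaG(model.parameters(), lr=1e-3, weight_decay=1e-4)
105 | criterion = nn.MSELoss()
106 | 
107 | x = torch.randn(32, 10)  # toy inputs
108 | y = torch.randn(32, 1)   # toy targets
109 | 
110 | optimizer.zero_grad()
111 | loss = criterion(model(x), y)
112 | loss.backward()
113 | optimizer.step()
114 | print(loss.item())
115 | ```
116 | 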
94 | ## Additional Information and Tips
95 |
96 | - Make sure that the parameters passed are compatible with the model you are using.
97 | - To maximize the loss function (useful in adversarial training), set `maximize=True`.
98 |
99 | ## Common Issues
100 |
101 | - If sparse gradients are involved, the SophiaG optimizer is not applicable.
102 |
103 | ## References and Resources
104 |
105 | - [An Overview of Gradient Descent Optimization Algorithms](https://arxiv.org/abs/1609.04747)
106 | - [Zeta Documentation](https://zeta.apac.ai)
107 |
108 | For further questions or issues, visit our [GitHub repository](https://github.com/kyegomez/zeta).
109 |
--------------------------------------------------------------------------------
/docs/zeta/training/train.md:
--------------------------------------------------------------------------------
1 | # Documentation for `Trainer` Module from Zeta Library
2 |
3 | ---
4 |
5 | ## Introduction
6 |
7 | The `Trainer` module from the Zeta library provides an easy-to-use, flexible, and scalable approach to training deep learning models. By abstracting away many of the lower-level details of training, including distributed training, gradient accumulation, and model checkpointing, `Trainer` allows developers to focus on the high-level aspects of model development and experimentation.
8 |
9 | This module also integrates seamlessly with the HuggingFace `Accelerator` to enable mixed precision training, GPU acceleration, and distributed training across multiple nodes or GPUs.
10 |
11 | ---
12 |
13 | ## `Trainer` Function Definition
14 |
15 | ```python
16 | def Trainer(
17 | gradient_accumulate_every: int = None,
18 | batch_size: int = None,
19 | seq_len: int = None,
20 | entity_name: str = None,
21 | model = None,
22 | use_fsdp: bool = False,
23 | use_activation_checkpointing: bool = False,
24 | learning_rate = None,
25 | seed = None,
26 | use_pretokenized: bool = False,
27 | resume_from_checkpoint = None,
28 | checkpointing_steps = None,
29 | output_dir = None,
30 | weight_decay = None,
31 | use_deepspeed = None
32 | ):
33 | ```
34 |
35 | ### Parameters
36 |
37 | - `gradient_accumulate_every` (`int`, optional): Specifies how often to accumulate gradients. Default: `None`.
38 | - `batch_size` (`int`, optional): Specifies the batch size for training. Default: `None`.
39 | - `seq_len` (`int`, optional): Sequence length for model inputs. Default: `None`.
40 | - `entity_name` (`str`, optional): Name of the entity for logging purposes. Default: `None`.
41 | - `model`: The model to train. Default: `None`.
42 | - `use_fsdp` (`bool`, optional): Whether or not to use Fully Sharded Data Parallelism (FSDP). Default: `False`.
43 | - `use_activation_checkpointing` (`bool`, optional): Use activation checkpointing to save memory during training. Default: `False`.
44 | - `learning_rate`: The learning rate for training. Default: `None`.
45 | - `seed`: Random seed for reproducibility. Default: `None`.
46 | - `use_pretokenized` (`bool`, optional): Whether to use pre-tokenized data. Default: `False`.
47 | - `resume_from_checkpoint`: Path to a checkpoint to resume training from. Default: `None`.
48 | - `checkpointing_steps`: How often to save model checkpoints. Default: `None`.
49 | - `output_dir`: Directory to save final trained model and checkpoints. Default: `None`.
50 | - `weight_decay`: Weight decay value for regularization. Default: `None`.
51 | - `use_deepspeed`: Whether to use deepspeed for training optimization. Default: `None`.
52 |
53 | ---
54 |
55 | ## Functionality and Usage
56 |
57 | The primary function of the `Trainer` module is to handle the training process, including data loading, optimization, and model updates. It leverages HuggingFace's `Accelerator` to provide accelerated training on GPUs and distributed environments.
58 |
59 | Here are the primary steps:
60 |
61 | 1. Initialization of the `Accelerator` for GPU training and gradient accumulation.
62 | 2. Model and optimizer initialization.
63 | 3. Loading datasets and setting up data loaders.
64 | 4. Training loop with gradient accumulation and model checkpointing.
65 | 5. Save the final trained model.
66 |
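67 | For intuition, these steps roughly correspond to the condensed sketch below, written directly against Hugging Face's `Accelerator`. It is illustrative only; `Trainer` wraps these steps (plus checkpointing, FSDP, and dataset handling) for you, and the model, data, and loss here are placeholders.
68 | 
69 | ```python
70 | # Condensed sketch of the training workflow that Trainer automates.
71 | import torch
72 | from accelerate import Accelerator
73 | 
74 | accelerator = Accelerator(gradient_accumulation_steps=2)    # step 1
75 | model = torch.nn.Linear(128, 128)                           # step 2: model ...
76 | optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)  # ... and optimizer
77 | dataloader = torch.utils.data.DataLoader(torch.randn(64, 128), batch_size=8)  # step 3
78 | 
79 | model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
80 | 
81 | for batch in dataloader:                                    # step 4: training loop
82 |     with accelerator.accumulate(model):
83 |         loss = model(batch).pow(2).mean()                   # placeholder loss
84 |         accelerator.backward(loss)
85 |         optimizer.step()
86 |         optimizer.zero_grad()
87 | 
88 | accelerator.save(model.state_dict(), "model.pt")            # step 5
89 | ```
90 | 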
67 | ### Code Examples
68 |
69 | **1. Basic Usage**
70 |
71 | ```python
72 | from zeta import Trainer
73 |
74 | model = ... # Your model definition here
75 | Trainer(
76 | gradient_accumulate_every=2,
77 | batch_size=32,
78 | seq_len=128,
79 | model=model,
80 | learning_rate=0.001,
81 | seed=42,
82 | output_dir='./models/'
83 | )
84 | ```
85 |
86 | **2. Resuming Training from a Checkpoint**
87 |
88 | ```python
89 | from zeta import Trainer
90 |
91 | model = ... # Your model definition here
92 | Trainer(
93 | gradient_accumulate_every=2,
94 | batch_size=32,
95 | seq_len=128,
96 | model=model,
97 | learning_rate=0.001,
98 | seed=42,
99 | resume_from_checkpoint='./models/checkpoint.pt',
100 | output_dir='./models/'
101 | )
102 | ```
103 |
104 | **3. Using FSDP and Activation Checkpointing**
105 |
106 | ```python
107 | from zeta import Trainer
108 |
109 | model = ... # Your model definition here
110 | Trainer(
111 | gradient_accumulate_every=2,
112 | batch_size=32,
113 | seq_len=128,
114 | model=model,
115 | use_fsdp=True,
116 | use_activation_checkpointing=True,
117 | learning_rate=0.001,
118 | seed=42,
119 | output_dir='./models/'
120 | )
121 | ```
122 |
123 | ---
124 |
125 | ## Mathematical Description
126 |
127 | Given a dataset \( D \) consisting of data points \( \{ (x_1, y_1), (x_2, y_2), ... (x_N, y_N) \} \), the trainer aims to minimize the loss function \( L \) with respect to model parameters \( \theta \):
128 |
129 | \[ \theta^* = \arg\min_{\theta} \frac{1}{N} \sum_{i=1}^{N} L(f(x_i; \theta), y_i) \]
130 |
131 |
132 |
133 | where \( f \) is the model's prediction function.
134 |
135 | ---
136 |
137 | ## Conclusions
138 |
139 | The `Trainer` module from the Zeta library streamlines the training process by abstracting away many complexities, making it a valuable tool for developers at all experience levels. Whether you are training a simple model or a complex architecture in a distributed environment, the `Trainer` module offers the flexibility and ease of use to get your models trained efficiently.
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Swarm-Corporation/Multi-Agent-Template-App/dbb3ebd78a39b698068b2d4eae4365450fa05dbe/example.py
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Package Docs
2 | plugins:
3 | - glightbox
4 | - search
5 | copyright: "© APAC Corp, Inc."
6 | extra_css:
7 | - docs/assets/css/extra.css
8 | extra:
9 | # analytics:
10 | # provider: google
11 | # property: G-QM8EDPSCB6
12 | social:
13 | - icon: fontawesome/solid/house
14 | link: assets/img/ZetaLogoIcon.png
15 | - icon: fontawesome/brands/discord
16 | link: https://discord.gg/qUtxnK2NMf
17 | - icon: fontawesome/brands/github
18 | link: https://github.com/kyegomez/Zeta/
19 | - icon: fontawesome/brands/python
20 | link: https://pypi.org/project/Zeta/
21 | theme:
22 | name: material
23 | custom_dir: docs/overrides
24 | logo: assets/img/ZetaLogoIcon.png
25 | palette:
26 | # Palette toggle for light mode
27 | - scheme: default
28 | primary: 'custom'
29 | toggle:
30 | icon: material/brightness-7
31 | name: Switch to dark mode
32 | # Palette toggle for dark mode
33 | - scheme: slate
34 | primary: 'custom'
35 | accent: light blue
36 | toggle:
37 | icon: material/brightness-4
38 | name: Switch to light mode
39 | features:
40 | - content.code.copy
41 | - content.code.annotate
42 | - navigation.tabs
43 | - navigation.sections
44 | - navigation.expand
45 | - navigation.top
46 | - announce.dismiss
47 | font:
48 | text: Roboto
49 | code: Roboto Mono
50 |
51 | extra_css:
52 | - stylesheets/extra.css
53 |
54 | markdown_extensions:
55 | - pymdownx.highlight:
56 | anchor_linenums: true
57 | line_spans: __span
58 | pygments_lang_class: true
59 | - admonition
60 | - pymdownx.inlinehilite
61 | - pymdownx.snippets
62 | - pymdownx.superfences
63 | - pymdownx.details
64 | - pymdownx.tabbed
65 | - tables
66 | - def_list
67 | - footnotes
68 |
69 |
70 | nav:
71 | - Home:
72 | - Overview: "index.md"
73 | - Contributing: "contributing.md"
74 | - FAQ: "faq.md"
75 | - Purpose: "purpose.md"
76 | - Roadmap: "roadmap.md"
77 | - Design: "design.md"
78 | - Flywheel: "flywheel.md"
79 | - Bounties: "bounties.md"
80 | - Metric: "metric.md"
81 | - Distribution: "distribution"
82 | - Research: "research.md"
83 | - Demos: "demos.md"
84 | - Architecture: "architecture.md"
85 | - Checklist: "checklist.md"
86 | - Hiring: "hiring.md"
87 | - Zeta:
88 | - Overview: "zeta/index.md"
89 | - zeta.nn:
90 | - zeta.nn.biases:
91 | - Xpos: "zeta/nn/biases/xpos.md"
92 | - RelativePositionBias: "zeta/nn/biases/relative_bias.md"
93 | - AlibiPositionalBias: "zeta/nn/biases/alibi.md"
94 | - zeta.nn.embeddings:
95 | - MultiWay: "zeta/nn/embeddings/multiway.md"
96 | - RotaryEmbeddings: "zeta/nn/embeddings/rope.md"
97 | - TruncatedRotaryEmbedding: "zeta/nn/embeddings/truncated_rope.md"
98 | - zeta.nn.modules:
99 | - Lora: "zeta/nn/modules/lora.md"
100 | - TokenLearner: "zeta/nn/modules/token_learner.md"
101 | - zeta.nn.attention:
102 | - FlashAttention: "zeta/nn/attention/flash_attention.md"
103 | - MultiQueryAttention: "zeta/nn/attention/multiquery.md"
104 | - MultiheadAttention: "zeta/nn/attention/multihead.md"
105 | - FlashAttentionTwo: "zeta/nn/attention/flash2.md"
106 | - BaseAttention: "zeta/nn/attention/base.md"
107 | - zeta.nn.architecture:
108 | - Decoder: "zeta/nn/architecture/decoder.md"
109 | - Transformer: "zeta/nn/architecture/transformer.md"
110 | - zeta.training:
111 | - train: "zeta/training/train.md"
112 | - zeta.training.loss:
113 | - Nebula: "zeta/training/nebula.md"
114 | - zeta.training.optimizers:
115 | - DecoupledLionW: "zeta/training/optimizers/decoupled_lion.md"
116 | - SophiaG: "zeta/training/optimizers/sophia.md"
117 | - zeta.tokenizers:
118 | - MultiModalTokenizer: "zeta/tokenizers/multi_modal_tokenizer.md"
119 | - LanguageTokenizerGPTX: "zeta/tokenizers/language_tokenizer.md"
120 | - SentencePieceTokenizer: "zeta/tokenizers/sentencepiece.md"
121 | - Examples:
122 | - Overview: "examples/index.md"
123 | - FlashAttention: "examples/nn/attentions/flash.md"
124 |
--------------------------------------------------------------------------------
/package/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Swarm-Corporation/Multi-Agent-Template-App/dbb3ebd78a39b698068b2d4eae4365450fa05dbe/package/__init__.py
--------------------------------------------------------------------------------
/package/main.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Swarm-Corporation/Multi-Agent-Template-App/dbb3ebd78a39b698068b2d4eae4365450fa05dbe/package/main.py
--------------------------------------------------------------------------------
/package/subfolder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Swarm-Corporation/Multi-Agent-Template-App/dbb3ebd78a39b698068b2d4eae4365450fa05dbe/package/subfolder/__init__.py
--------------------------------------------------------------------------------
/package/subfolder/main.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Swarm-Corporation/Multi-Agent-Template-App/dbb3ebd78a39b698068b2d4eae4365450fa05dbe/package/subfolder/main.py
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["poetry-core>=1.0.0"]
3 | build-backend = "poetry.core.masonry.api"
4 |
5 | [tool.poetry]
6 | name = "paper"
7 | version = "0.0.1"
8 | description = "Paper - Pytorch"
9 | license = "MIT"
10 | authors = ["Kye Gomez "]
11 | homepage = "https://github.com/kyegomez/paper"
12 | documentation = "https://github.com/kyegomez/paper" # Add this if you have documentation.
13 | readme = "README.md" # Assuming you have a README.md
14 | repository = "https://github.com/kyegomez/paper"
15 | keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering"]
16 | classifiers = [
17 | "Development Status :: 4 - Beta",
18 | "Intended Audience :: Developers",
19 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
20 | "License :: OSI Approved :: MIT License",
21 | "Programming Language :: Python :: 3.9"
22 | ]
23 |
24 | [tool.poetry.dependencies]
25 | python = "^3.10"
26 | swarms = "*"
27 | pydantic = "*"
28 | fastapi = "*"
29 |
30 | [tool.poetry.group.lint.dependencies]
31 | ruff = "^0.6.2"
32 | types-toml = "^0.10.8.1"
33 | types-redis = "^4.3.21.6"
34 | types-pytz = "^2024.1.0.20240417"
35 | black = "^24.4.2"
36 | types-chardet = "^5.0.4.6"
37 | mypy-protobuf = "^3.0.0"
38 |
39 |
40 | [tool.autopep8]
41 | max_line_length = 80
42 | ignore = "E501,W6" # or ["E501", "W6"]
43 | in-place = true
44 | recursive = true
45 | aggressive = 3
46 |
47 |
48 | [tool.ruff]
49 | line-length = 70
50 |
51 | [tool.black]
52 | line-length = 70
53 | target-version = ['py38']
54 | preview = true
55 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | zetascale
3 | swarms
4 |
--------------------------------------------------------------------------------
/scripts/code_quality.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Navigate to the directory containing the 'package' folder
4 | # cd /path/to/your/code/directory
5 |
6 | # Run autopep8 with max aggressiveness (-aaa) and in-place modification (-i)
7 | # on all Python files (*.py) under the 'package' directory.
8 | autopep8 --in-place --aggressive --aggressive --recursive --experimental --list-fixes package/
9 |
10 | # Run black with default settings, since black does not have an aggressiveness level.
11 | # Black will format all Python files it finds in the 'package' directory.
12 | black --preview package/
13 |
14 | # Run ruff on the 'package' directory.
15 | # Add any additional flags if needed according to your version of ruff.
16 | ruff check package/ --fix --unsafe-fixes
17 |
18 | # YAPF
19 | yapf --recursive --in-place --verbose --style=google --parallel package
20 |
--------------------------------------------------------------------------------
/scripts/merge_all_prs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Check if we are inside a Git repository
4 | if ! git rev-parse --git-dir > /dev/null 2>&1; then
5 | echo "Error: Must be run inside a Git repository."
6 | exit 1
7 | fi
8 |
9 | # Fetch all open pull requests
10 | echo "Fetching open PRs..."
11 | prs=$(gh pr list --state open --json number --jq '.[].number')
12 |
13 | # Check if there are PRs to merge
14 | if [ -z "$prs" ]; then
15 | echo "No open PRs to merge."
16 | exit 0
17 | fi
18 |
19 | echo "Found PRs: $prs"
20 |
21 | # Loop through each pull request number and merge it
22 | for pr in $prs; do
23 | echo "Attempting to merge PR #$pr"
24 | merge_output=$(gh pr merge $pr --auto --merge)
25 | merge_status=$?
26 | if [ $merge_status -ne 0 ]; then
27 | echo "Failed to merge PR #$pr. Error: $merge_output"
28 | else
29 | echo "Successfully merged PR #$pr"
30 | fi
31 | done
32 |
33 | echo "Processing complete."
34 |
--------------------------------------------------------------------------------
/scripts/test_name.sh:
--------------------------------------------------------------------------------
1 | find ./tests -name "*.py" -type f | while read file
2 | do
3 | filename=$(basename "$file")
4 | dir=$(dirname "$file")
5 | if [[ $filename != test_* ]]; then
6 | mv "$file" "$dir/test_$filename"
7 | fi
8 | done
--------------------------------------------------------------------------------
/scripts/tests.sh:
--------------------------------------------------------------------------------
1 | find ./tests -name '*.py' -exec pytest {} \;
--------------------------------------------------------------------------------