├── .gitignore
├── README.md
├── client
│   ├── .eslintrc.json
│   ├── .gitignore
│   ├── README.md
│   ├── apollo-client.ts
│   ├── app
│   │   ├── contact
│   │   │   ├── Contact.tsx
│   │   │   └── page.tsx
│   │   ├── favicon.ico
│   │   ├── globals.css
│   │   ├── layout.tsx
│   │   ├── middleware.ts
│   │   ├── models
│   │   │   ├── Models.tsx
│   │   │   └── page.tsx
│   │   ├── page.tsx
│   │   ├── pods
│   │   │   └── page.tsx
│   │   └── train
│   │       ├── page.tsx
│   │       └── train.tsx
│   ├── components.json
│   ├── components
│   │   ├── Evaluate.tsx
│   │   ├── PodStatus.tsx
│   │   ├── gradient.tsx
│   │   └── ui
│   │       ├── button.tsx
│   │       ├── card.tsx
│   │       ├── dialog.tsx
│   │       ├── dropdown-menu.tsx
│   │       ├── input.tsx
│   │       ├── label.tsx
│   │       ├── slider.tsx
│   │       ├── textarea.tsx
│   │       ├── toast.tsx
│   │       ├── toaster.tsx
│   │       └── use-toast.ts
│   ├── lib
│   │   ├── supabase.ts
│   │   └── utils.ts
│   ├── next.config.mjs
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── fsdp.jpg
│   ├── tailwind.config.ts
│   ├── tsconfig.json
│   └── utils
│       ├── getPod.ts
│       ├── getPods.ts
│       ├── rentPod.ts
│       └── stopPod.ts
├── fault_tolerance
│   ├── Dockerfile
│   ├── ping.py
│   ├── requirements.txt
│   └── restarts.py
├── fsdp
│   ├── Dockerfile
│   ├── docker-compose.yml
│   ├── fsdp_qlora
│   │   ├── .gitignore
│   │   ├── benchmarking
│   │   │   ├── large_gpu_benchmarking.sh
│   │   │   └── small_gpu_benchmarking.sh
│   │   ├── fsdp_multi_node.sh
│   │   ├── hf_train.py
│   │   ├── nbs
│   │   │   ├── 00-profile_lora_qlora.ipynb
│   │   │   ├── 00-profile_lora_qlora_hqq.ipynb
│   │   │   ├── 01-ft_benchmarking.ipynb
│   │   │   ├── 02-qlora-memeff-loading.ipynb
│   │   │   └── HQQ.ipynb
│   │   ├── scripts
│   │   │   ├── __init__.py
│   │   │   ├── block_expansion.py
│   │   │   ├── dora.py
│   │   │   └── lora.py
│   │   ├── table1.sh
│   │   ├── tests
│   │   │   ├── test_block_expansion.py
│   │   │   └── test_dora.py
│   │   ├── train.py
│   │   ├── train.sh
│   │   ├── train_hqq_bench.sh
│   │   └── train_sql.sh
│   └── train_llama.py
├── launcher.txt
├── parallel
│   ├── Dockerfile
│   ├── __pycache__
│   │   ├── config.cpython-310.pyc
│   │   ├── config.cpython-311.pyc
│   │   ├── dataset.cpython-310.pyc
│   │   ├── dataset.cpython-311.pyc
│   │   ├── fairscale.cpython-310.pyc
│   │   ├── inference.cpython-311.pyc
│   │   ├── main.cpython-310.pyc
│   │   ├── model.cpython-310.pyc
│   │   └── model.cpython-311.pyc
│   ├── config.py
│   ├── dataset.py
│   ├── fairscale_test.py
│   ├── fairscale_transformer.py
│   ├── inference.py
│   ├── interface.txt
│   ├── main.py
│   ├── mini_transformer_test.py
│   ├── model.py
│   ├── requirements.txt
│   ├── tokenizer_en.json
│   ├── tokenizer_it.json
│   └── train.py
├── scheduler
│   ├── Dockerfile
│   ├── requirements.txt
│   └── scheduler.py
├── setup.sh
├── test.py
└── virtual_llm
    ├── Dockerfile
    ├── main.py
    └── requirements.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv/
ENV/
.vscode/
.DS_Store
.env
.python-version
*.safetensors
fsdp/fsdp_qlora_2

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# podplex

🦾💻🌐 distributed training & serverless inference at scale (https://podplex.run)

_built in < 24 hours at the [RunPod hackathon](https://partiful.com/e/PjgYh4cceTpxWN27i7ty) (co-hosted by Etched, Nomic, Replit, and vLLM)_

_🏆 UPDATE: This project won the $10,000 prize from RunPod at the hackathon! We feel humbled and are excited for the new things we can build with the credits._
## our architecture

- train & inference on **RunPod Serverless**, **RunPod pods**, and **RunPod network storage**
- data visualizations on **Nomic**
- frontend on **Vercel**
- built with **Replit**

![Architecture Diagram](https://i.postimg.cc/dtgKdhQm/Screenshot-2024-05-19-at-11-12-32.png)

## Motivation

The world isn't facing a GPU supply problem; it's facing a GPU under-utilization problem.
For comparison, there are an estimated 20-40 million GPUs on the Ethereum blockchain, three orders of magnitude more than the number of GPUs used to train Llama 3, one of the largest open-source training runs of all time.

However, tapping into this compute is tricky. Most individual GPU providers don't have A100s or H100s connected with NVLink and InfiniBand.

That's where PodPlex comes in. We integrate with decentralized cloud providers like RunPod and use distribution schemes like Fully Sharded Data Parallel (FSDP) to partition large models into shards, which can fit and train on smaller devices. At TreeHacks, our team built an earlier prototype of this on a 4-layer DNN, hand-computing the gradients and network connections. This past weekend, we focused on scaling that approach to integrate natively with PyTorch, letting us support more architectures without rewriting the symbolic math ourselves.

## Features

- Train machine learning models across distributed spot instances with FSDP (Answer.AI implementation)
- Automatically orchestrate RunPod pods for training via a custom Docker image
- Use RunPod Spot instances + Community Cloud to reduce cloud costs by up to 76% (benchmarked on RTX 4090s)
- Automatically handle restarts/failed nodes using checkpoint backups
- Run eval benchmarks against trained models using RunPod Serverless
- Visualize evals in Nomic for quick feedback loops

![Nomic Viz](https://i.postimg.cc/mgPSHrp4/image.png)

## Code Overview

### client

The frontend for podplex, where you can start training and evaluation jobs. It uses the RunPod GraphQL API to spin up pods; those pods then use custom Docker images (defined by each `Dockerfile`) to train the model.

### fault_tolerance

AWS Lambda code that checks spot instance health and restarts pods if any shut down.

### fsdp

Uses the Fully Sharded Data Parallel methodology to train across multiple GPUs (see the [AnswerDotAI implementation](https://github.com/AnswerDotAI/fsdp_qlora)). These pods use custom Docker images to train the model.

### scheduler

AWS Lambda code that determines whether the fault_tolerance Lambda should run, which it does by enabling and disabling an EventBridge rule, as sketched below.
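A minimal sketch of that toggle, assuming `boto3` and a hypothetical rule name (the real handler lives in `scheduler/scheduler.py`):

```python
import boto3

# EventBridge rule that triggers the fault_tolerance Lambda on a schedule.
# "fault-tolerance-ping" is a hypothetical name used here for illustration.
RULE_NAME = "fault-tolerance-ping"

events = boto3.client("events")

def handler(event, context):
    # Run health checks only while a training job is live.
    training_active = bool(event.get("training_active"))
    if training_active:
        events.enable_rule(Name=RULE_NAME)
    else:
        events.disable_rule(Name=RULE_NAME)
    return {"rule": RULE_NAME, "enabled": training_active}
```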
### virtual_llm

RunPod Serverless endpoint that serves inference with vLLM.

### parallel

RunPod Serverless endpoint with PyTorch inference + experiments.

## Getting Started

Start training with `fsdp/train_llama.py`. Sample training command:

```
python train_llama.py \
  --model_name meta-llama/Meta-Llama-Guard-2-8B \
  --batch_size 2 \
  --context_length 512 \
  --precision bf16 \
  --train_type qlora \
  --use_gradient_checkpointing true \
  --use_cpu_offload true \
  --dataset alpaca \
  --reentrant_checkpointing true
```

Note that you need to request access to the Llama model on its Hugging Face page before you can download it.

--------------------------------------------------------------------------------
/client/.eslintrc.json:
--------------------------------------------------------------------------------
{
  "extends": "next/core-web-vitals"
}

--------------------------------------------------------------------------------
/client/.gitignore:
--------------------------------------------------------------------------------
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

# dependencies
/node_modules
/.pnp
.pnp.js
.yarn/install-state.gz

# testing
/coverage

# next.js
/.next/
/out/

# production
/build

# misc
.DS_Store
*.pem

# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# local env files
.env*.local

# vercel
.vercel

# typescript
*.tsbuildinfo
next-env.d.ts

--------------------------------------------------------------------------------
/client/README.md:
--------------------------------------------------------------------------------
This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).

## Getting Started

First, run the development server:

```bash
npm run dev
# or
yarn dev
# or
pnpm dev
# or
bun dev
```

Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.

You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file.

This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font.

## Learn More

To learn more about Next.js, take a look at the following resources:

- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.

You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js/) - your feedback and contributions are welcome!

## Deploy on Vercel

The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.

Check out our [Next.js deployment documentation](https://nextjs.org/docs/deployment) for more details.

--------------------------------------------------------------------------------
/client/apollo-client.ts:
--------------------------------------------------------------------------------
import { ApolloClient, InMemoryCache, HttpLink } from "@apollo/client";

// RunPod's GraphQL API is authenticated with an API key passed as a query
// parameter. The NEXT_PUBLIC_ prefix exposes this key to the browser bundle.
const API_KEY = process.env.NEXT_PUBLIC_RUNPOD_API_KEY;

export const client = new ApolloClient({
  link: new HttpLink({
    uri: `https://api.runpod.io/graphql?api_key=${API_KEY}`,
  }),
  cache: new InMemoryCache(),
});
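
// Usage sketch: listing the account's pods through the client above. The
// query shape assumes RunPod's public GraphQL schema; the app's real helpers
// live in client/utils/ (getPods.ts, rentPod.ts, stopPod.ts).
import { gql } from "@apollo/client";

export async function listPods() {
  const { data } = await client.query({
    query: gql`
      query Pods {
        myself {
          pods {
            id
            name
          }
        }
      }
    `,
  });
  return data.myself.pods;
}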

--------------------------------------------------------------------------------
/client/app/contact/Contact.tsx:
--------------------------------------------------------------------------------
"use client";

import { Label } from "@/components/ui/label";
import { Input } from "@/components/ui/input";
import { Textarea } from "@/components/ui/textarea";
import { Button } from "@/components/ui/button";
import { useState } from "react";
import { createClient } from "@/lib/utils";
import { useToast } from "@/components/ui/use-toast";

const supabase = createClient();

export default function Contact() {
  const [name, setName] = useState("");
  const [email, setEmail] = useState("");
  const [message, setMessage] = useState("");
  const { toast } = useToast();

  // Persist the message to Supabase, then reset the form and confirm via toast.
  const handleSubmit = async (e: React.FormEvent) => {
    e.preventDefault();
    await supabase.from("messages").insert([{ name, email, message }]);
    setName("");
    setEmail("");
    setMessage("");

    toast({
      title: "Message sent!",
      description: "We'll get back to you as soon as possible.",
    });
  };

  return (
    <div>
      <div>
        <h1>Get in Touch</h1>
        <p>
          Have a question or want to work together? Fill out the form below and
          we&apos;ll get back to you as soon as possible.
        </p>
      </div>
      <form onSubmit={handleSubmit}>
        <div>
          <div>
            <Label htmlFor="name">Name</Label>
            <Input
              id="name"
              value={name}
              onChange={(e) => setName(e.target.value)}
            />
          </div>
          <div>
            <Label htmlFor="email">Email</Label>
            <Input
              id="email"
              type="email"
              value={email}
              onChange={(e) => setEmail(e.target.value)}
            />
          </div>
          <div>
            <Label htmlFor="message">Message</Label>