├── .gitignore
├── README.md
├── client
│   ├── .eslintrc.json
│   ├── .gitignore
│   ├── README.md
│   ├── apollo-client.ts
│   ├── app
│   │   ├── contact
│   │   │   ├── Contact.tsx
│   │   │   └── page.tsx
│   │   ├── favicon.ico
│   │   ├── globals.css
│   │   ├── layout.tsx
│   │   ├── middleware.ts
│   │   ├── models
│   │   │   ├── Models.tsx
│   │   │   └── page.tsx
│   │   ├── page.tsx
│   │   ├── pods
│   │   │   └── page.tsx
│   │   └── train
│   │       ├── page.tsx
│   │       └── train.tsx
│   ├── components.json
│   ├── components
│   │   ├── Evaluate.tsx
│   │   ├── PodStatus.tsx
│   │   ├── gradient.tsx
│   │   └── ui
│   │       ├── button.tsx
│   │       ├── card.tsx
│   │       ├── dialog.tsx
│   │       ├── dropdown-menu.tsx
│   │       ├── input.tsx
│   │       ├── label.tsx
│   │       ├── slider.tsx
│   │       ├── textarea.tsx
│   │       ├── toast.tsx
│   │       ├── toaster.tsx
│   │       └── use-toast.ts
│   ├── lib
│   │   ├── supabase.ts
│   │   └── utils.ts
│   ├── next.config.mjs
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── fsdp.jpg
│   ├── tailwind.config.ts
│   ├── tsconfig.json
│   └── utils
│       ├── getPod.ts
│       ├── getPods.ts
│       ├── rentPod.ts
│       └── stopPod.ts
├── fault_tolerance
│   ├── Dockerfile
│   ├── ping.py
│   ├── requirements.txt
│   └── restarts.py
├── fsdp
│   ├── Dockerfile
│   ├── docker-compose.yml
│   ├── fsdp_qlora
│   │   ├── .gitignore
│   │   ├── benchmarking
│   │   │   ├── large_gpu_benchmarking.sh
│   │   │   └── small_gpu_benchmarking.sh
│   │   ├── fsdp_multi_node.sh
│   │   ├── hf_train.py
│   │   ├── nbs
│   │   │   ├── 00-profile_lora_qlora.ipynb
│   │   │   ├── 00-profile_lora_qlora_hqq.ipynb
│   │   │   ├── 01-ft_benchmarking.ipynb
│   │   │   ├── 02-qlora-memeff-loading.ipynb
│   │   │   └── HQQ.ipynb
│   │   ├── scripts
│   │   │   ├── __init__.py
│   │   │   ├── block_expansion.py
│   │   │   ├── dora.py
│   │   │   └── lora.py
│   │   ├── table1.sh
│   │   ├── tests
│   │   │   ├── test_block_expansion.py
│   │   │   └── test_dora.py
│   │   ├── train.py
│   │   ├── train.sh
│   │   ├── train_hqq_bench.sh
│   │   └── train_sql.sh
│   └── train_llama.py
├── launcher.txt
├── parallel
│   ├── Dockerfile
│   ├── __pycache__
│   │   ├── config.cpython-310.pyc
│   │   ├── config.cpython-311.pyc
│   │   ├── dataset.cpython-310.pyc
│   │   ├── dataset.cpython-311.pyc
│   │   ├── fairscale.cpython-310.pyc
│   │   ├── inference.cpython-311.pyc
│   │   ├── main.cpython-310.pyc
│   │   ├── model.cpython-310.pyc
│   │   └── model.cpython-311.pyc
│   ├── config.py
│   ├── dataset.py
│   ├── fairscale_test.py
│   ├── fairscale_transformer.py
│   ├── inference.py
│   ├── interface.txt
│   ├── main.py
│   ├── mini_transformer_test.py
│   ├── model.py
│   ├── requirements.txt
│   ├── tokenizer_en.json
│   ├── tokenizer_it.json
│   └── train.py
├── scheduler
│   ├── Dockerfile
│   ├── requirements.txt
│   └── scheduler.py
├── setup.sh
├── test.py
└── virtual_llm
    ├── Dockerfile
    ├── main.py
    └── requirements.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv/
ENV/
.vscode/
.DS_Store
.env
.python-version
*.safetensors
fsdp/fsdp_qlora_2

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# podplex

🦾💻🌐 distributed training & serverless inference at scale (https://podplex.run)

_built in < 24 hours at the [RunPod hackathon](https://partiful.com/e/PjgYh4cceTpxWN27i7ty) (co-hosted by Etched, Nomic, Replit, and vLLM)_

_🏆 UPDATE: This project won the $10,000 prize from RunPod at the hackathon! We feel humbled and are excited for the new things we can build with the credits._
## our architecture

- train & inference on **RunPod Serverless**, **RunPod pods**, and **RunPod network storage**
- data visualizations on **Nomic**
- frontend on **Vercel**
- built with **Replit**

![Architecture Diagram](https://i.postimg.cc/dtgKdhQm/Screenshot-2024-05-19-at-11-12-32.png)

## Motivation

The world isn't facing a GPU supply problem; it's facing a GPU under-utilization problem.
For comparison, there are an estimated 20-40 million GPUs on the Ethereum blockchain, three orders of magnitude more than the number of GPUs used to train Llama 3, one of the largest open-source training runs of all time.

However, tapping into this compute is tricky. Most individual GPU providers don't have A100s or H100s connected with NVLink and InfiniBand.

That's where PodPlex comes in. We integrate with decentralized cloud providers like RunPod and use distribution schemes like Fully Sharded Data Parallel (FSDP) to partition large models into shards, which can fit and train on smaller devices. At TreeHacks, our team built an earlier prototype of this on a 4-layer DNN, hand-computing the gradients and network connections. This past weekend, we focused on scaling that approach to integrate natively with PyTorch, letting us support more architectures without rewriting the symbolic math ourselves.

## Features

- Train machine learning models across distributed spot instances with FSDP (Answer.AI implementation)
- Automatically orchestrate RunPod pods for training via a custom Docker image
- Use RunPod Spot instances + Community Cloud to reduce cloud costs by up to 76% (benchmarked on RTX 4090s)
- Automatically handle restarts/failed nodes using checkpoint backups
- Run eval benchmarks against trained models using RunPod Serverless
- Visualize evals in Nomic for quick feedback loops

![Nomic Viz](https://i.postimg.cc/mgPSHrp4/image.png)

## Code Overview

### client

The frontend for podplex, where you can start training and evaluation jobs. It uses the RunPod GraphQL API to spin up pods; those pods then use custom Docker images (defined by each `Dockerfile`) to train the model.

### fault_tolerance

AWS Lambda code that checks spot instance health and restarts pods if any shut down.

### fsdp

Uses the Fully Sharded Data Parallel methodology to train across multiple GPUs (see the [AnswerDotAI implementation](https://github.com/AnswerDotAI/fsdp_qlora)). These pods use custom Docker images to train the model.

### scheduler

AWS Lambda code that determines whether the fault_tolerance Lambda should run, which it does by enabling and disabling an EventBridge rule, as sketched below.
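A minimal sketch of that toggle, assuming `boto3` and a hypothetical rule name (the real handler lives in `scheduler/scheduler.py`):

```python
import boto3

# EventBridge rule that triggers the fault_tolerance Lambda on a schedule.
# "fault-tolerance-ping" is a hypothetical name used here for illustration.
RULE_NAME = "fault-tolerance-ping"

events = boto3.client("events")

def handler(event, context):
    # Run health checks only while a training job is live.
    training_active = bool(event.get("training_active"))
    if training_active:
        events.enable_rule(Name=RULE_NAME)
    else:
        events.disable_rule(Name=RULE_NAME)
    return {"rule": RULE_NAME, "enabled": training_active}
```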
### virtual_llm

RunPod Serverless endpoint that serves inference with vLLM.

### parallel

RunPod Serverless endpoint with PyTorch inference + experiments.

## Getting Started

Start training with `fsdp/train_llama.py`. Sample training command:

```
python train_llama.py \
  --model_name meta-llama/Meta-Llama-Guard-2-8B \
  --batch_size 2 \
  --context_length 512 \
  --precision bf16 \
  --train_type qlora \
  --use_gradient_checkpointing true \
  --use_cpu_offload true \
  --dataset alpaca \
  --reentrant_checkpointing true
```

Note that you need to request access to the Llama model on its Hugging Face page before you can download it.

--------------------------------------------------------------------------------
/client/.eslintrc.json:
--------------------------------------------------------------------------------
{
  "extends": "next/core-web-vitals"
}

--------------------------------------------------------------------------------
/client/.gitignore:
--------------------------------------------------------------------------------
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

# dependencies
/node_modules
/.pnp
.pnp.js
.yarn/install-state.gz

# testing
/coverage

# next.js
/.next/
/out/

# production
/build

# misc
.DS_Store
*.pem

# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# local env files
.env*.local

# vercel
.vercel

# typescript
*.tsbuildinfo
next-env.d.ts

--------------------------------------------------------------------------------
/client/README.md:
--------------------------------------------------------------------------------
This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).

## Getting Started

First, run the development server:

```bash
npm run dev
# or
yarn dev
# or
pnpm dev
# or
bun dev
```

Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.

You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file.

This project uses [`next/font`](https://nextjs.org/docs/basic-features/font-optimization) to automatically optimize and load Inter, a custom Google Font.

## Learn More

To learn more about Next.js, take a look at the following resources:

- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.

You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js/) - your feedback and contributions are welcome!

## Deploy on Vercel

The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.

Check out our [Next.js deployment documentation](https://nextjs.org/docs/deployment) for more details.

--------------------------------------------------------------------------------
/client/apollo-client.ts:
--------------------------------------------------------------------------------
import { ApolloClient, InMemoryCache, HttpLink } from "@apollo/client";

// RunPod's GraphQL API is authenticated with an API key passed as a query
// parameter. The NEXT_PUBLIC_ prefix exposes this key to the browser bundle.
const API_KEY = process.env.NEXT_PUBLIC_RUNPOD_API_KEY;

export const client = new ApolloClient({
  link: new HttpLink({
    uri: `https://api.runpod.io/graphql?api_key=${API_KEY}`,
  }),
  cache: new InMemoryCache(),
});
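
// Usage sketch: listing the account's pods through the client above. The
// query shape assumes RunPod's public GraphQL schema; the app's real helpers
// live in client/utils/ (getPods.ts, rentPod.ts, stopPod.ts).
import { gql } from "@apollo/client";

export async function listPods() {
  const { data } = await client.query({
    query: gql`
      query Pods {
        myself {
          pods {
            id
            name
          }
        }
      }
    `,
  });
  return data.myself.pods;
}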

--------------------------------------------------------------------------------
/client/app/contact/Contact.tsx:
--------------------------------------------------------------------------------
"use client";

import { Label } from "@/components/ui/label";
import { Input } from "@/components/ui/input";
import { Textarea } from "@/components/ui/textarea";
import { Button } from "@/components/ui/button";
import { useState } from "react";
import { createClient } from "@/lib/utils";
import { useToast } from "@/components/ui/use-toast";

const supabase = createClient();

export default function Contact() {
  const [name, setName] = useState("");
  const [email, setEmail] = useState("");
  const [message, setMessage] = useState("");
  const { toast } = useToast();

  // Persist the message to Supabase, then reset the form and confirm via toast.
  const handleSubmit = async (e: React.FormEvent) => {
    e.preventDefault();
    await supabase.from("messages").insert([{ name, email, message }]);
    setName("");
    setEmail("");
    setMessage("");

    toast({
      title: "Message sent!",
      description: "We'll get back to you as soon as possible.",
    });
  };

  return (
    <div>
      <div>
        <h1>Get in Touch</h1>
        <p>
          Have a question or want to work together? Fill out the form below and
          we&apos;ll get back to you as soon as possible.
        </p>
      </div>
      <form onSubmit={handleSubmit}>
        <div>
          <div>
            <Label htmlFor="name">Name</Label>
            <Input
              id="name"
              value={name}
              onChange={(e) => setName(e.target.value)}
            />
          </div>
          <div>
            <Label htmlFor="email">Email</Label>
            <Input
              id="email"
              type="email"
              value={email}
              onChange={(e) => setEmail(e.target.value)}
            />
          </div>
          <div>
            <Label htmlFor="message">Message</Label>