5 | tag: "latest"
6 | pullPolicy: IfNotPresent
7 |
8 | service:
9 | fastapi:
10 | type: ClusterIP
11 | port: 3001
12 | redis:
13 | type: ClusterIP
14 | port: 6379
15 | flower:
16 | type: ClusterIP
17 | port: 5555
18 |
19 | env:
20 | WATSONX_URL: "https://us-south.ml.cloud.ibm.com"
21 | WX_PROJECT_ID: ""
22 | IBM_CLOUD_API_KEY: ""
23 | CELERY_BROKER_URL: "redis://redis:6379/0"
24 | CELERY_RESULT_BACKEND: "redis://redis:6379/0"
25 |
26 | resources: {}
27 |
--------------------------------------------------------------------------------
/JudgeIt-App/app/api/auth/[...nextauth]/route.js:
--------------------------------------------------------------------------------
1 | import NextAuth from "next-auth";
2 | import Auth0Provider from "next-auth/providers/auth0";
3 |
4 | export const authOptions = {
5 | providers: [
6 | Auth0Provider({
7 |       issuer: process.env.OAUTH_ISSUER_URL,
8 |       clientId: process.env.OAUTH_CLIENT_ID,
9 |       clientSecret: process.env.OAUTH_CLIENT_SECRET,
10 | id: 'IBMid',
11 | name: 'IBMid',
12 | }),
13 | ],
14 | pages: {
15 | signIn: "/signin"
16 | }
17 | }
18 |
19 | const handler = NextAuth(authOptions);
20 |
21 | export { handler as GET, handler as POST };
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/EvaluationTypeLabel.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import { Tooltip } from "@mui/material";
3 | import InfoOutlinedIcon from "@mui/icons-material/InfoOutlined";
4 |
5 | const EvaluationTypeLabel = ({ label, tooltip }) => {
6 | return (
7 |     <span style={{ display: "inline-flex", alignItems: "center" }}>
8 |       {label}
9 |       <Tooltip
10 |         title={tooltip}
11 |         placement="top"
12 |         arrow
13 |       >
14 |         <InfoOutlinedIcon fontSize="small" sx={{ marginLeft: "5px" }} />
15 |       </Tooltip>
16 |     </span>
17 |
18 |
19 | );
20 | };
21 |
22 | export default EvaluationTypeLabel;
23 |
--------------------------------------------------------------------------------
/REST-Service/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use the Red Hat UBI 8 Python 3.11 image as the base image
2 | FROM registry.access.redhat.com/ubi8/python-311:latest
3 |
4 | # Set the working directory inside the container
5 | WORKDIR /app/backend
6 |
7 | # Copy the requirements file to the container and install dependencies
8 | COPY requirements.txt requirements.txt
9 | RUN pip3 install -r requirements.txt
10 | # pymongo bundles its own bson module; the standalone PyPI "bson" package conflicts with it
11 | RUN pip3 install pymongo
12 |
13 | # Copy your FastAPI Python script to the container
14 | COPY main.py main.py
15 | COPY app/ app/
16 | COPY cert/ cert/
17 |
18 | EXPOSE 3001
19 |
20 | # Set the command to run your Python script
21 | CMD ["python3", "main.py"]
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/LinearProgressWithLabel.jsx:
--------------------------------------------------------------------------------
1 | import LinearProgress from '@mui/material/LinearProgress';
2 | import Typography from '@mui/material/Typography';
3 | import Box from '@mui/material/Box';
4 |
5 | export default function LinearProgressWithLabel({ value, width }) {
6 | return (
7 |     <Box sx={{ display: 'flex', alignItems: 'center', width: width }}>
8 |       <Box sx={{ width: '100%', mr: 1 }}>
9 |         <LinearProgress variant="determinate" value={value} />
10 |       </Box>
11 |       <Box sx={{ minWidth: 35 }}>
12 |         <Typography variant="body2" color="text.secondary">
13 |           {`${Math.round(value)}%`}
14 |         </Typography>
15 |       </Box>
16 |     </Box>
17 | );
18 | }
--------------------------------------------------------------------------------
/JudgeIt-App/utils/sessionTokenAccessor.js:
--------------------------------------------------------------------------------
1 | import { getServerSession } from "next-auth";
2 | import { authOptions } from "../app/api/auth/[...nextauth]/route";
3 | import { decrypt } from "./encryption";
4 |
5 | export async function getAccessToken() {
6 |
7 | const session = await getServerSession(authOptions);
8 | if(session){
9 | const accessTokenDecrypted = decrypt(session.access_token)
10 | return accessTokenDecrypted;
11 | }
12 | return null;
13 | }
14 |
15 | export async function getIdToken() {
16 |
17 | const session = await getServerSession(authOptions);
18 | if(session){
19 | const idTokenDecrypted = decrypt(session.id_token)
20 | return idTokenDecrypted;
21 | }
22 | return null;
23 | }
--------------------------------------------------------------------------------
/REST-Service/app/route/root/routes.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter
2 | from fastapi.responses import HTMLResponse
3 |
4 | root_api_route = APIRouter()
5 |
6 | API_PREFIX = "/"
7 | ## This route serves a simple HTML landing page for the LLM Judge service
8 | @root_api_route.get(API_PREFIX)
9 | def root_api():
10 | return HTMLResponse(
11 | """
12 |         <html>
13 |           <head>
14 |             <title>LLM Judge service</title>
15 |           </head>
16 |           <body>
17 |             <h1>LLM Judge service!</h1>
18 |             <p>For complete API visit <a href="/docs">open API docs</a></p>
19 |           </body>
20 |         </html>
21 | """
22 | )
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | certifi==2024.07.04
2 | chardet==5.2.0
3 | charset-normalizer==3.3.2
4 | click==8.1.7
5 | ibm-cos-sdk==2.13.5
6 | ibm-cos-sdk-core==2.13.5
7 | ibm-cos-sdk-s3transfer==2.13.5
8 | ibm_watsonx_ai==1.0.10
9 | idna==3.7
10 | importlib_metadata==8.0.0
11 | jmespath==1.0.1
12 | joblib==1.4.2
13 | langchain-ibm==0.1.12
14 | lomond==0.3.3
15 | nltk==3.8.1
16 | numpy==1.26.4
17 | openpyxl==3.1.5
18 | packaging==24.1
19 | pandas==2.1.4
20 | python-dateutil==2.9.0.post0
21 | pytz==2024.1
22 | regex==2024.5.15
23 | requests==2.32.3
24 | rouge==1.0.1
25 | scikit-learn==1.5.0
26 | scipy==1.14.0
27 | six==1.16.0
28 | tabulate==0.9.0
29 | threadpoolctl==3.5.0
30 | tqdm==4.66.4
31 | tzdata==2024.1
32 | urllib3==2.1.0
33 | XlsxWriter==3.2.0
34 | zipp==3.19.2
35 |
--------------------------------------------------------------------------------
/REST-Service/deployment/base/kustomization.yaml:
--------------------------------------------------------------------------------
1 | kind: Kustomization
2 | images:
3 | - name: backend-image-name
4 | newName: image-registry.openshift-image-registry.svc:5000/llm-judge-dev/backend
5 | newTag: v1.0
6 | secretGenerator:
7 | - name: llm-judge-secret
8 | literals:
9 | - WATSONX_URL=
10 | - WX_PROJECT_ID=
11 | - IBM_CLOUD_API_KEY=
12 | - LLM_JUDGE_API_KEY=JudgeIt-Secret-Api-Key
13 | - WX_PLATFORM=saas
14 | - WX_USER=
15 | - CELERY_BROKER_URL=redis://redis:6379/0
16 | - CELERY_RESULT_BACKEND=redis://redis:6379/0
17 | - SERVER_URL=
18 | - MONGO_URL=
19 | - MONGO_USER=
20 | - MONGO_PASS=
21 |       - MONGO_DB=judgeit_app
22 | resources:
23 | - redis/
24 | - celery-worker/
25 | - flower/
26 | - rest-app/
--------------------------------------------------------------------------------
/JudgeIt-App/app/api/auth/logout/route.js:
--------------------------------------------------------------------------------
1 | import { authOptions } from "../[...nextauth]/route";
2 | import { getServerSession } from "next-auth"
3 | import { getIdToken } from "@/utils/sessionTokenAccessor";
4 |
5 | export async function GET() {
6 | const session = await getServerSession(authOptions);
7 |
8 | if (session) {
9 |
10 | const idToken = await getIdToken();
11 |
12 |     // log the user out on the identity provider side (OIDC end-session endpoint)
13 |     const url = `${process.env.END_SESSION_URL}?id_token_hint=${idToken}&post_logout_redirect_uri=${encodeURIComponent(process.env.NEXTAUTH_URL)}`;
14 |
15 | try {
16 |       await fetch(url, { method: "GET" });
17 |     } catch (err) {
18 |       console.error(err);
19 |       return new Response(null, { status: 500 });
20 | }
21 | }
22 |   return new Response(null, { status: 200 });
23 | }
--------------------------------------------------------------------------------
/JudgeIt-App/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM node:22.4.1-alpine AS deps
2 | #RUN apk add --no-cache libc6-compat=1.2.4-r2
3 | WORKDIR /app
4 |
5 | COPY package.json ./
6 | COPY package-lock.json ./
7 | RUN npm install
8 |
9 | FROM node:22.4.1-alpine AS builder
10 | WORKDIR /app
11 | COPY --from=deps /app/node_modules ./node_modules
12 | COPY . .
13 |
14 | RUN npm run build
15 |
16 | FROM node:22.4.1-alpine AS runner
17 | WORKDIR /app
18 |
19 | ENV NODE_ENV production
20 | ENV NEXT_TELEMETRY_DISABLED 1
21 |
22 | RUN addgroup --system --gid 1001 nodejs
23 | RUN adduser --system --uid 1001 nextjs
24 |
25 | COPY --from=builder --chown=nextjs:nodejs /app/.next ./.next
26 | COPY --from=builder /app/node_modules ./node_modules
27 | COPY --from=builder /app/package.json ./package.json
28 |
29 | USER nextjs
30 |
31 | EXPOSE 3000
32 |
33 | ENV PORT 3000
34 |
35 | CMD ["npm", "start"]
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/Footer.jsx:
--------------------------------------------------------------------------------
1 | function Footer() {
2 | return (
3 |
23 | );
24 | }
25 |
26 | export default Footer;
27 |
--------------------------------------------------------------------------------
/JudgeIt-App/deployment/deployment.yaml:
--------------------------------------------------------------------------------
1 | kind: Deployment
2 | apiVersion: apps/v1
3 | metadata:
5 | name: llm-judge-frontend
6 | labels:
7 | app: llm-judge-frontend
8 | spec:
9 | replicas: 1
10 | selector:
11 | matchLabels:
12 | app: llm-judge-frontend
13 | template:
14 | metadata:
15 | labels:
16 | app: llm-judge-frontend
17 | deployment: llm-judge-frontend
18 | annotations:
19 | openshift.io/generated-by: OpenShiftWebConsole
20 | spec:
21 | containers:
22 | - name: llm-judge-frontend
23 | image: 'image-registry.openshift-image-registry.svc:5000/llm-judge/llm-judge-frontend@sha256:5ac9b1aa09123b4d09a7e0f297e542c895350f7a700779b36df77b0897f45f46'
24 | ports:
25 | - containerPort: 3000
26 | protocol: TCP
27 | envFrom:
28 | - secretRef:
29 | name: llmjudge-frontend-secret
30 | resources: {}
31 |
--------------------------------------------------------------------------------
/REST-Service/deployment/base/redis/deployment.yaml:
--------------------------------------------------------------------------------
1 | kind: Deployment
2 | apiVersion: apps/v1
3 | metadata:
4 | name: redis
5 | labels:
6 | app: redis
7 | spec:
8 | replicas: 1
9 | selector:
10 | matchLabels:
11 | app: redis
12 | template:
13 | metadata:
14 | labels:
15 | app: redis
16 | deployment: redis
17 | annotations:
18 | openshift.io/generated-by: OpenShiftWebConsole
19 | spec:
20 | volumes:
21 | - name: redis-1
22 | emptyDir: {}
23 | containers:
24 | - name: redis
25 | image: redis:7.2.5-alpine
26 | ports:
27 | - containerPort: 6379
28 | protocol: TCP
29 | resources: {}
30 | volumeMounts:
31 | - name: redis-1
32 | mountPath: /data
33 | terminationMessagePath: /dev/termination-log
34 | terminationMessagePolicy: File
35 | imagePullPolicy: IfNotPresent
36 | restartPolicy: Always
37 |
--------------------------------------------------------------------------------
/JudgeIt-App/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "judge-app",
3 | "version": "0.1.0",
4 | "private": true,
5 | "scripts": {
6 | "dev": "next dev",
7 | "build": "next build",
8 | "start": "next start",
9 | "lint": "next lint"
10 | },
11 | "dependencies": {
12 | "@emotion/react": "^11.11.4",
13 | "@emotion/styled": "^11.11.5",
14 | "@mui/icons-material": "^5.16.0",
15 | "@mui/material": "^5.16.0",
16 | "@mui/x-data-grid": "^7.16.0",
17 | "axios": "^1.7.2",
18 | "chart.js": "^4.4.4",
19 | "chartjs-plugin-datalabels": "^2.2.0",
20 | "cryptr": "^6.3.0",
21 | "formik": "^2.4.6",
22 | "next": "14.2.5",
23 | "next-auth": "^4.24.7",
24 | "react": "^18",
25 | "react-chartjs-2": "^5.2.0",
26 | "react-dom": "^18",
27 | "react-dropzone": "^14.2.3",
28 | "react-pro-sidebar": "^1.1.0",
29 | "uuid": "^10.0.0",
30 | "yup": "^1.4.0"
31 | },
32 | "devDependencies": {
33 | "eslint": "^8",
34 | "eslint-config-next": "14.2.5"
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/REST-Service/app/src/config/TimeoutMiddleware.py:
--------------------------------------------------------------------------------
1 | from fastapi.responses import JSONResponse
2 | from starlette.middleware.base import BaseHTTPMiddleware
3 | from fastapi import FastAPI, Request, HTTPException
4 | import time
5 |
6 | class TimeoutMiddleware(BaseHTTPMiddleware):
7 | def __init__(self, app, timeout: int):
8 | super().__init__(app)
9 | self.timeout = timeout
10 |
11 | async def dispatch(self, request: Request, call_next):
12 | start_time = time.time()
13 | try:
14 | response = await call_next(request)
15 | process_time = time.time() - start_time
16 | if process_time > self.timeout:
17 | raise HTTPException(status_code=408, detail="Request Timeout")
18 | return response
19 | except Exception as e:
20 | process_time = time.time() - start_time
21 | if process_time > self.timeout:
22 | return JSONResponse(content={"detail": "Request Timeout"}, status_code=408)
23 | raise e
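
Note: as written, `dispatch` above only measures elapsed time after `call_next` has finished, so a slow request still runs to completion before any 408 is returned. A minimal sketch of a middleware that actually cancels the in-flight handler at the deadline, using asyncio.wait_for (an assumed variant for illustration, not part of this repo):

    import asyncio

    from fastapi.responses import JSONResponse
    from starlette.middleware.base import BaseHTTPMiddleware

    class CancellingTimeoutMiddleware(BaseHTTPMiddleware):
        # hypothetical variant: aborts the handler once `timeout` seconds elapse
        def __init__(self, app, timeout: int):
            super().__init__(app)
            self.timeout = timeout

        async def dispatch(self, request, call_next):
            try:
                # wait_for cancels the wrapped coroutine when the deadline passes
                return await asyncio.wait_for(call_next(request), timeout=self.timeout)
            except asyncio.TimeoutError:
                return JSONResponse(content={"detail": "Request Timeout"}, status_code=408)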
--------------------------------------------------------------------------------
/JudgeIt-App/app/layout.js:
--------------------------------------------------------------------------------
1 | import "../styles/globals.css";
2 | import Footer from "@/components/globals/Footer";
3 | import Topbar from "@/components/globals/Topbar";
4 | import { Grid, Box, AppBar } from "@mui/material";
5 | import SessionProviderWrapper from "@/utils/sessionProviderWrapper";
6 |
7 | export const metadata = {
8 | title: "LLM Judge Application",
9 | description: "LLM Judge Application to evaluate LLM response.",
10 | };
11 |
12 | export default function RootLayout({ children }) {
13 | return (
14 |     <SessionProviderWrapper>
15 |       <html lang="en">
16 |         <body>
17 |           <AppBar position="static" color="transparent" elevation={0}>
18 |             <Topbar />
19 |           </AppBar>
20 |           <Box component="main">
21 |             {children}
22 |           </Box>
23 |           <Grid container justifyContent="center">
24 |             <Footer />
25 |           </Grid>
26 |         </body>
27 |       </html>
28 |     </SessionProviderWrapper>
29 |   );
30 | }
31 |
--------------------------------------------------------------------------------
/REST-Service/chart/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: fastapi-app
5 | labels:
6 | app: fastapi-app
7 | spec:
8 | type: {{ .Values.service.fastapi.type }}
9 | ports:
10 | - port: {{ .Values.service.fastapi.port }}
11 | targetPort: {{ .Values.service.fastapi.port }}
12 | selector:
13 | app: fastapi-app
14 |
15 | ---
16 |
17 | apiVersion: v1
18 | kind: Service
19 | metadata:
20 | name: redis
21 | labels:
22 | app: redis
23 | spec:
24 | type: {{ .Values.service.redis.type }}
25 | ports:
26 | - port: {{ .Values.service.redis.port }}
27 | targetPort: {{ .Values.service.redis.port }}
28 | selector:
29 | app: redis
30 |
31 | ---
32 |
33 | apiVersion: v1
34 | kind: Service
35 | metadata:
36 | name: flower
37 | labels:
38 | app: flower
39 | spec:
40 | type: {{ .Values.service.flower.type }}
41 | ports:
42 | - port: {{ .Values.service.flower.port }}
43 | targetPort: {{ .Values.service.flower.port }}
44 | selector:
45 | app: flower
46 |
--------------------------------------------------------------------------------
/REST-Service/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi
2 | uvicorn
3 | certifi==2024.6.2
4 | charset-normalizer==3.3.2
5 | click==8.1.7
6 | ibm-cos-sdk==2.13.5
7 | ibm-cos-sdk-core==2.13.5
8 | ibm-cos-sdk-s3transfer==2.13.5
9 | ibm_watson_machine_learning==1.0.359
10 | ibm_watsonx_ai==1.0.10
11 | idna==3.7
12 | importlib_metadata==8.0.0
13 | jmespath==1.0.1
14 | joblib==1.4.2
15 | lomond==0.3.3
16 | nltk==3.8.1
17 | numpy==1.26.4
18 | packaging==24.1
19 | pandas==2.1.4
20 | python-dateutil==2.9.0.post0
21 | pytz==2024.1
22 | regex==2024.5.15
23 | requests==2.32.4
24 | rouge==1.0.1
25 | scikit-learn==1.5.0
26 | scipy==1.14.0
27 | six==1.16.0
28 | tabulate==0.9.0
29 | threadpoolctl==3.5.0
30 | tqdm==4.66.4
31 | tzdata==2024.1
32 | urllib3==2.1.0
33 | zipp==3.19.2
34 | openpyxl==3.1.5
35 | langchain-ibm==0.1.10
36 | celery==5.4.0
37 | redis==5.0.7
38 | flower==2.0.1
39 | # asyncio is part of the Python standard library; the old PyPI backport (asyncio==3.4.3) must not be installed
40 | python-dotenv
41 | python-multipart
42 | fuzzywuzzy==0.18.0
43 | python-Levenshtein==0.27.1
44 | ibm-watsonx-gov==1.2.2
45 | Jinja2==3.1.2
46 | jsonschema==4.25.1
47 | unitxt==1.26.6
48 | textstat==0.7.10
--------------------------------------------------------------------------------
/JudgeIt-App/styles/globals.css:
--------------------------------------------------------------------------------
1 | @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500&family=Source+Sans+Pro:ital,wght@0,400;0,600;1,600&display=swap');
2 |
3 | html,
4 | body,
5 | #root,
6 | .app,
7 | .content {
8 | margin: 0;
9 | height: 100%;
10 | width: 100%;
11 | font-family: 'IBM Plex Sans';
12 | overflow: hidden;
13 | }
14 |
15 | .app {
16 | display: flex;
17 | position: relative;
18 | }
19 |
20 | ::-webkit-scrollbar {
21 | width: 10px;
22 | }
23 |
24 | /* Track */
25 |
26 | ::-webkit-scrollbar-track {
27 | background: #e0e0e0;
28 | }
29 |
30 | /* handle */
31 |
32 | ::-webkit-scrollbar-thumb {
33 | background: #888;
34 | }
35 |
36 | /* handle on hover */
37 |
38 | ::-webkit-scrollbar-thumb:hover {
39 | background: #555;
40 | }
41 |
42 | .drag-and-drop {
43 | width: 100%;
44 | height: 200px;
45 | border: 2px dashed #ccc;
46 | border-radius: 5px;
47 | display: flex;
48 | justify-content: center;
49 | align-items: center;
50 | cursor: pointer;
51 | }
52 |
53 | .dragging {
54 | background-color: #f1f1f1;
55 | }
--------------------------------------------------------------------------------
/REST-Service/deployment/base/flower/deployment.yaml:
--------------------------------------------------------------------------------
1 | kind: Deployment
2 | apiVersion: apps/v1
3 | metadata:
4 | name: flower-app
5 | labels:
6 | app: flower-app
7 | spec:
8 | replicas: 1
9 | selector:
10 | matchLabels:
11 | app: flower-app
12 | template:
13 | metadata:
14 | labels:
15 | app: flower-app
16 | deployment: flower-app
17 | spec:
18 | containers:
19 | - resources: {}
20 | terminationMessagePath: /dev/termination-log
21 | name: flower-app
22 | command:
23 | - celery
24 | - '--broker=redis://redis:6379/0'
25 | - flower
26 | - '--port=5555'
27 | ports:
28 | - containerPort: 5555
29 | protocol: TCP
30 | - containerPort: 8080
31 | protocol: TCP
32 | imagePullPolicy: IfNotPresent
33 | terminationMessagePolicy: File
34 | envFrom:
35 | - secretRef:
36 | name: llm-judge-secret
37 | image: backend-image-name:latest
38 |
--------------------------------------------------------------------------------
/REST-Service/cert/mongo.crt:
--------------------------------------------------------------------------------
1 | -----BEGIN CERTIFICATE-----
2 | MIIDDzCCAfegAwIBAgIJANEH58y2/kzHMA0GCSqGSIb3DQEBCwUAMB4xHDAaBgNV
3 | BAMME0lCTSBDbG91ZCBEYXRhYmFzZXMwHhcNMTgwNjI1MTQyOTAwWhcNMjgwNjIy
4 | MTQyOTAwWjAeMRwwGgYDVQQDDBNJQk0gQ2xvdWQgRGF0YWJhc2VzMIIBIjANBgkq
5 | hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA8lpaQGzcFdGqeMlmqjffMPpIQhqpd8qJ
6 | Pr3bIkrXJbTcJJ9uIckSUcCjw4Z/rSg8nnT13SCcOl+1to+7kdMiU8qOWKiceYZ5
7 | y+yZYfCkGaiZVfazQBm45zBtFWv+AB/8hfCTdNF7VY4spaA3oBE2aS7OANNSRZSK
8 | pwy24IUgUcILJW+mcvW80Vx+GXRfD9Ytt6PRJgBhYuUBpgzvngmCMGBn+l2KNiSf
9 | weovYDCD6Vngl2+6W9QFAFtWXWgF3iDQD5nl/n4mripMSX6UG/n6657u7TDdgkvA
10 | 1eKI2FLzYKpoKBe5rcnrM7nHgNc/nCdEs5JecHb1dHv1QfPm6pzIxwIDAQABo1Aw
11 | TjAdBgNVHQ4EFgQUK3+XZo1wyKs+DEoYXbHruwSpXjgwHwYDVR0jBBgwFoAUK3+X
12 | Zo1wyKs+DEoYXbHruwSpXjgwDAYDVR0TBAUwAwEB/zANBgkqhkiG9w0BAQsFAAOC
13 | AQEAJf5dvlzUpqaix26qJEuqFG0IP57QQI5TCRJ6Xt/supRHo63eDvKw8zR7tlWQ
14 | lV5P0N2xwuSl9ZqAJt7/k/3ZeB+nYwPoyO3KvKvATunRvlPBn4FWVXeaPsG+7fhS
15 | qsejmkyonYw77HRzGOzJH4Zg8UN6mfpbaWSsyaExvqknCp9SoTQP3D67AzWqb1zY
16 | doqqgGIZ2nxCkp5/FXxF/TMb55vteTQwfgBy60jVVkbF7eVOWCv0KaNHPF5hrqbN
17 | i+3XjJ7/peF3xMvTMoy35DcT3E2ZeSVjouZs15O90kI3k2daS2OHJABW0vSj4nLz
18 | +PQzp/B9cQmOO8dCe049Q3oaUA==
19 | -----END CERTIFICATE-----
20 |
21 |
--------------------------------------------------------------------------------
/REST-Service/deployment/base/celery-worker/deployment.yaml:
--------------------------------------------------------------------------------
1 | kind: Deployment
2 | apiVersion: apps/v1
3 | metadata:
4 | name: celery-worker
5 | labels:
6 | app: celery-worker
7 | spec:
8 | replicas: 1
9 | selector:
10 | matchLabels:
11 | app: celery-worker
12 | template:
13 | metadata:
14 | labels:
15 | app: celery-worker
16 | deployment: celery-worker
17 | annotations:
18 | openshift.io/generated-by: OpenShiftWebConsole
19 | spec:
20 | containers:
21 | - resources: {}
22 | terminationMessagePath: /dev/termination-log
23 | name: celery-worker
24 | command:
25 | - celery
26 | - '-A'
27 | - app.celery.celery_worker.celery
28 | - worker
29 | - '--loglevel=info'
30 | ports:
31 | - containerPort: 3001
32 | protocol: TCP
33 | - containerPort: 8080
34 | protocol: TCP
35 | imagePullPolicy: IfNotPresent
36 | terminationMessagePolicy: File
37 | envFrom:
38 | - secretRef:
39 | name: llm-judge-secret
40 | image: backend-image-name:latest
41 |
--------------------------------------------------------------------------------
/REST-Service/deployment/base/rest-app/deployment.yaml:
--------------------------------------------------------------------------------
1 | kind: Deployment
2 | apiVersion: apps/v1
3 | metadata:
4 | name: llm-judge-backend
5 | labels:
6 | app: llm-judge-backend
7 | spec:
8 | replicas: 1
9 | selector:
10 | matchLabels:
11 | app: llm-judge-backend
12 | template:
13 | metadata:
14 | labels:
15 | app: llm-judge-backend
16 | deployment: llm-judge-backend
17 | spec:
18 | containers:
19 | - resources: {}
20 | terminationMessagePath: /dev/termination-log
21 | name: llm-judge-backend
22 | ports:
23 | - containerPort: 3001
24 | protocol: TCP
25 | - containerPort: 8080
26 | protocol: TCP
27 | imagePullPolicy: IfNotPresent
28 | envFrom:
29 | - secretRef:
30 | name: llm-judge-secret
31 | image: backend-image-name:latest
32 | volumeMounts:
33 | - name: mongodb-cert-volume
34 | readOnly: true
35 | mountPath: /app/backend/cert
36 | volumes:
37 | - name: mongodb-cert-volume
38 | secret:
39 | secretName: mongodb-cert-secret
40 | defaultMode: 420
--------------------------------------------------------------------------------
/JudgeIt-App/public/next.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/icons/IBMIconTop.jsx:
--------------------------------------------------------------------------------
1 | import * as React from 'react';
2 | import SvgIcon from '@mui/material/SvgIcon';
3 |
4 | export default function IBMIcon() {
5 |
6 | return (
7 |
8 |
9 |
10 | );
11 | }
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/SignIn.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 |
3 | import { signIn } from "next-auth/react";
4 | import { useSearchParams } from "next/navigation";
5 | import IBMIcon from "./icons/IBMIcon";
6 | import { LineWeight } from "@mui/icons-material";
7 | import { Grid } from "@mui/material";
8 | import React, { Suspense } from "react";
9 |
10 | function SignInWithIBMIdContent() {
11 | const searchParams = useSearchParams();
12 | const callbackUrl = searchParams.get("callbackUrl") || "/";
13 |
14 | return (
15 |     <Grid
16 |       container
17 |       direction="column"
18 |       justifyContent="center"
19 |       alignItems="center"
20 |       sx={{ minHeight: "80vh" }}
21 |     >
22 |       <Grid item>
23 |         <button
24 |           type="button"
25 |           onClick={() => signIn("IBMid", { callbackUrl: callbackUrl })}>
26 |           <span
27 |             style={{
28 |               display: "inline-flex",
29 |               alignItems: "center",
30 |               gap: "8px",
31 |             }}
32 |           >
33 |             <IBMIcon />
34 |             <span>
35 |               Sign in with IBMid
36 |             </span>
37 |           </span>
38 |         </button>
39 |       </Grid>
40 |     </Grid>
41 | );
42 | }
43 |
44 | export default function SignInWithIBMId() {
45 | return (
46 | Loading...}>
47 |
48 |
49 | );
50 | }
51 |
--------------------------------------------------------------------------------
/REST-Service/app/src/services/MongoService.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dotenv import load_dotenv
3 | from pymongo import MongoClient
4 | from pymongo.errors import ConnectionFailure
5 | from bson.objectid import ObjectId
6 |
7 | load_dotenv()
8 |
9 | class MongoService:
10 |
11 | def __init__(self):
12 | # MongoDB backend
13 | MONGO_URL=os.getenv('MONGO_URL')
14 | MONGO_USER=os.getenv('MONGO_USER')
15 | MONGO_PASS=os.getenv('MONGO_PASS')
16 |
17 | self.MONGO_DB=os.getenv('MONGO_DB')
18 |
19 | ##f"mongodb://{MONGO_USER}:{MONGO_PASS}@{MONGO_URL}"
20 |
21 | client = MongoClient(
22 |
23 | f"mongodb://{MONGO_USER}:{MONGO_PASS}@{MONGO_URL}/{self.MONGO_DB}?authSource={self.MONGO_DB}",
24 | ssl=True,
25 | tlsCAFile="cert/mongo.crt"
26 | )
27 | self.client = client
28 |         print(f"mongo client: {client}")
29 |
30 | def get_db(self):
31 | db = self.client[self.MONGO_DB]
32 | return db
33 |
34 | def get_collection(self, collection_name):
35 | collection = self.get_db()[collection_name]
36 | return collection
37 |
38 | def get_request_history_collection(self):
39 | return self.get_collection('request_histories')
40 |
41 | def get_experiment_collection(self):
42 | return self.get_collection('experiments')
43 |
44 | def find_one(self, collection, id):
45 | one = collection.find_one({'_id': ObjectId(id)})
46 | return one
47 |
48 |
49 |
50 |
51 |
52 |
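
Note: a usage sketch for MongoService; the ObjectId string below is a hypothetical placeholder, and the MONGO_* environment variables plus cert/mongo.crt must be in place:

    service = MongoService()
    histories = service.get_request_history_collection()
    record = service.find_one(histories, "66a0f1c2e4b0a1b2c3d4e5f6")  # hypothetical id
    print(record)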
--------------------------------------------------------------------------------
/JudgeIt-App/public/vercel.svg:
--------------------------------------------------------------------------------
1 |
2 |
7 |
--------------------------------------------------------------------------------
/REST-Service/deployment/base/rest-app/secret.yaml:
--------------------------------------------------------------------------------
1 | kind: Secret
2 | apiVersion: v1
3 | metadata:
4 | name: mongodb-cert-secret
5 | data:
6 | mongo.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUREekNDQWZlZ0F3SUJBZ0lKQU5FSDU4eTIva3pITUEwR0NTcUdTSWIzRFFFQkN3VUFNQjR4SERBYUJnTlYKQkFNTUUwbENUU0JEYkc5MVpDQkVZWFJoWW1GelpYTXdIaGNOTVRnd05qSTFNVFF5T1RBd1doY05Namd3TmpJeQpNVFF5T1RBd1dqQWVNUnd3R2dZRFZRUUREQk5KUWswZ1EyeHZkV1FnUkdGMFlXSmhjMlZ6TUlJQklqQU5CZ2txCmhraUc5dzBCQVFFRkFBT0NBUThBTUlJQkNnS0NBUUVBOGxwYVFHemNGZEdxZU1sbXFqZmZNUHBJUWhxcGQ4cUoKUHIzYklrclhKYlRjSko5dUlja1NVY0NqdzRaL3JTZzhublQxM1NDY09sKzF0bys3a2RNaVU4cU9XS2ljZVlaNQp5K3laWWZDa0dhaVpWZmF6UUJtNDV6QnRGV3YrQUIvOGhmQ1RkTkY3Vlk0c3BhQTNvQkUyYVM3T0FOTlNSWlNLCnB3eTI0SVVnVWNJTEpXK21jdlc4MFZ4K0dYUmZEOVl0dDZQUkpnQmhZdVVCcGd6dm5nbUNNR0JuK2wyS05pU2YKd2VvdllEQ0Q2Vm5nbDIrNlc5UUZBRnRXWFdnRjNpRFFENW5sL240bXJpcE1TWDZVRy9uNjY1N3U3VERkZ2t2QQoxZUtJMkZMellLcG9LQmU1cmNuck03bkhnTmMvbkNkRXM1SmVjSGIxZEh2MVFmUG02cHpJeHdJREFRQUJvMUF3ClRqQWRCZ05WSFE0RUZnUVVLMytYWm8xd3lLcytERW9ZWGJIcnV3U3BYamd3SHdZRFZSMGpCQmd3Rm9BVUszK1gKWm8xd3lLcytERW9ZWGJIcnV3U3BYamd3REFZRFZSMFRCQVV3QXdFQi96QU5CZ2txaGtpRzl3MEJBUXNGQUFPQwpBUUVBSmY1ZHZselVwcWFpeDI2cUpFdXFGRzBJUDU3UVFJNVRDUko2WHQvc3VwUkhvNjNlRHZLdzh6Ujd0bFdRCmxWNVAwTjJ4d3VTbDlacUFKdDcvay8zWmVCK25Zd1BveU8zS3ZLdkFUdW5SdmxQQm40RldWWGVhUHNHKzdmaFMKcXNlam1reW9uWXc3N0hSekdPekpINFpnOFVONm1mcGJhV1NzeWFFeHZxa25DcDlTb1RRUDNENjdBeldxYjF6WQpkb3FxZ0dJWjJueENrcDUvRlh4Ri9UTWI1NXZ0ZVRRd2ZnQnk2MGpWVmtiRjdlVk9XQ3YwS2FOSFBGNWhycWJOCmkrM1hqSjcvcGVGM3hNdlRNb3kzNURjVDNFMlplU1Zqb3VaczE1Tzkwa0kzazJkYVMyT0hKQUJXMHZTajRuTHoKK1BRenAvQjljUW1PTzhkQ2UwNDlRM29hVUE9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCgo=
7 | type: Opaque
8 |
--------------------------------------------------------------------------------
/REST-Service/main.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI, Request, HTTPException
2 | from fastapi.middleware.trustedhost import TrustedHostMiddleware
3 | import uvicorn
4 | import logging
5 | from dotenv import load_dotenv
6 | from app.route.root import routes as root_api
7 | from app.route.llm_judge import routes as llm_judge_api
8 | from app.route.llm_manage import routes as judge_management_api
9 | from fastapi.middleware.cors import CORSMiddleware
10 | import os
11 | from app.src.config.TimeoutMiddleware import TimeoutMiddleware
12 |
13 | load_dotenv()
14 | platform = os.environ.get("PLATFORM")
15 | server_url = os.environ.get("SERVER_URL", default="http://localhost:3001")
16 |
17 | app = FastAPI(
18 | title="LLM JUDGE API",
19 |     description="API to judge LLM responses and return ratings and feedback",
20 | version="1.0.1-fastapi",
21 | servers=[
22 | {
23 | "url": server_url
24 | }
25 | ],
26 | )
27 |
28 | logging.basicConfig(level=logging.INFO)
29 | logger = logging.getLogger('api-service')
30 |
31 | # Register blueprints
32 | app.include_router(root_api.root_api_route)
33 | app.include_router(llm_judge_api.judge_api_route)
34 | app.include_router(judge_management_api.judge_management_api_route)
35 |
36 | origins = [ "*"]
37 |
38 | app.add_middleware(
39 | CORSMiddleware,
40 | allow_origins=origins,
41 | allow_credentials=False,
42 | allow_methods=["*"],
43 | allow_headers=["*"],
44 | )
45 |
46 | app.add_middleware(TimeoutMiddleware, timeout=600) # Timeout set to 600 seconds (10 minutes)
47 |
48 | if __name__ == '__main__':
49 | uvicorn.run("main:app", host='0.0.0.0', port=3001)
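
Note: a quick smoke test against a locally running instance, assuming the default SERVER_URL of http://localhost:3001 used above:

    import requests

    resp = requests.get("http://localhost:3001/")
    print(resp.status_code)   # expect 200
    print(resp.text[:80])     # start of the HTML landing page served by the root route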
--------------------------------------------------------------------------------
/JudgeIt-App/app/pages/help/page.js:
--------------------------------------------------------------------------------
1 | import BatchInstructions from "@/components/globals/BatchInstructions";
2 | import Footer from "@/components/globals/Footer";
3 | import SingleInstructions from "@/components/globals/SingleInstructions";
4 | import { Box, Grid, Paper, Typography } from "@mui/material";
5 | import React from "react";
6 |
7 | const HelperPage = () => {
8 | return (
9 |     <Box
10 |       sx={{
11 |         width: "100%",
12 |         height: "100%",
13 |         overflowY: "auto",
14 |         padding: "20px",
15 |       }}
16 |     >
17 |       <Grid container spacing={2} justifyContent="center">
18 |         <Grid item xs={12}>
19 |           <Typography
20 |             variant="h4"
21 |             align="center"
22 |             gutterBottom
23 |             sx={{
24 |               marginTop: "10px",
25 |             }}
26 |           >
27 |             Documentation
28 |           </Typography>
29 |         </Grid>
30 |         <Grid item xs={12} md={10}>
31 |           <Paper
32 |             elevation={2}
33 |             sx={{
34 |               padding: "20px",
35 |               marginBottom: "20px",
36 |             }}
37 |           >
38 |             <SingleInstructions />
39 |           </Paper>
40 |         </Grid>
41 |         <Grid item xs={12} md={10}>
42 |           <Paper
43 |             elevation={2}
44 |             sx={{
45 |               padding: "20px",
46 |               marginBottom: "20px",
47 |             }}
48 |           >
49 |             <BatchInstructions />
50 |           </Paper>
51 |         </Grid>
52 |       </Grid>
53 |       <Footer />
54 |     </Box>
55 | );
56 | };
57 |
58 | export default HelperPage;
59 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/icons/IBMIcon.jsx:
--------------------------------------------------------------------------------
1 | import * as React from "react";
2 | import SvgIcon from "@mui/material/SvgIcon";
3 |
4 | export default function IBMIcon() {
5 | return (
6 |
7 |
12 |
13 | );
14 | }
15 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/DeleteConfirmationDialog.jsx:
--------------------------------------------------------------------------------
1 | import React, { useState } from 'react';
2 | import Button from '@mui/material/Button';
3 | import Dialog from '@mui/material/Dialog';
4 | import DialogActions from '@mui/material/DialogActions';
5 | import DialogContent from '@mui/material/DialogContent';
6 | import DialogContentText from '@mui/material/DialogContentText';
7 | import DialogTitle from '@mui/material/DialogTitle';
8 |
9 | const DeleteConfirmationDialog = ({ itemName, onDelete }) => {
10 | const [open, setOpen] = useState(false);
11 |
12 | const handleClickOpen = () => {
13 | setOpen(true);
14 | };
15 |
16 | const handleClose = () => {
17 | setOpen(false);
18 | };
19 |
20 | const handleConfirmDelete = () => {
21 | onDelete(); // Call the delete action
22 | handleClose(); // Close the dialog
23 | };
24 |
25 | return (
26 |     <>
27 |       <Button color="error" onClick={handleClickOpen}>
28 |         Delete {itemName}
29 |       </Button>
30 |       <Dialog
31 |         open={open}
32 |         onClose={handleClose}
33 |         aria-labelledby="alert-dialog-title"
34 |         aria-describedby="alert-dialog-description"
35 |       >
36 |         <DialogTitle id="alert-dialog-title">
37 |           {"Confirm Delete"}
38 |         </DialogTitle>
39 |         <DialogContent>
40 |           <DialogContentText id="alert-dialog-description">
41 |             Are you sure you want to delete {itemName}? This action cannot be undone.
42 |           </DialogContentText>
43 |         </DialogContent>
44 |         <DialogActions>
45 |           <Button onClick={handleClose}>
46 |             Cancel
47 |           </Button>
48 |           <Button color="error" onClick={handleConfirmDelete} autoFocus>
49 |             Confirm Delete
50 |           </Button>
51 |         </DialogActions>
52 |       </Dialog>
53 |     </>
54 | );
55 | };
56 |
57 | export default DeleteConfirmationDialog;
58 |
--------------------------------------------------------------------------------
/REST-Service/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | fastapi_app:
3 | container_name: fastapi_app
4 | platform: linux/amd64
5 | image: fastapi_app_image
6 | #volumes:
7 | # - ./app:/app
8 | ports:
9 | - 3001:3001
10 | environment:
11 | - WATSONX_URL=https://us-south.ml.cloud.ibm.com
12 | - WX_PROJECT_ID=***
13 | - IBM_CLOUD_API_KEY=***
14 | - LLM_JUDGE_API_KEY=JudgeIt-Secret-Api-Key
15 | - WX_PLATFORM=saas
16 |       - WX_USER=
17 | - WX_GOV_REGION=eu-de
18 | - CELERY_BROKER_URL=redis://redis:6379/0
19 | - CELERY_RESULT_BACKEND=redis://redis:6379/0
20 | - SERVER_URL=http://localhost:3001
21 | - MONGO_URL=***
22 | - MONGO_USER=***
23 | - MONGO_PASS=***
24 | - MONGO_DB=judge_it_dev
25 | - WX_NEG_TEST_MODEL=mistralai/mistral-medium-2505
26 | - WX_GOV_INSTANCE=
27 | restart: always
28 | redis:
29 | container_name: redis
30 | image: redis:7.2.5-alpine
31 | restart: always
32 | celery_worker:
33 | container_name: celery_worker
34 | build: .
35 | #volumes:
36 | # - ./app:/app
37 | command: celery -A app.celery.celery_worker.celery worker --loglevel=info
38 | environment:
39 | - WATSONX_URL=https://us-south.ml.cloud.ibm.com
40 | - WX_PROJECT_ID=***
41 | - WX_PLATFORM=saas
42 |       - WX_USER=
43 | - WX_GOV_REGION=eu-de
44 | - IBM_CLOUD_API_KEY=***
45 | - CELERY_BROKER_URL=redis://redis:6379/0
46 | - CELERY_RESULT_BACKEND=redis://redis:6379/0
47 | - WX_NEG_TEST_MODEL=mistralai/mistral-medium-2505
48 | - WX_GOV_INSTANCE=
49 | depends_on:
50 | - fastapi_app
51 | - redis
52 | restart: always
53 | flower:
54 | container_name: flower
55 | build: .
56 | command: celery --broker=redis://redis:6379/0 flower --port=5555
57 | ports:
58 | - 5556:5555
59 | environment:
60 | - CELERY_BROKER_URL=redis://redis:6379/0
61 | - CELERY_RESULT_BACKEND=redis://redis:6379/0
62 | depends_on:
63 | - fastapi_app
64 | - redis
65 | - celery_worker
66 | restart: always
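
Note: inside this compose network the Celery broker and result backend both live at redis://redis:6379/0. A minimal sketch of inspecting a task result over that wiring (hypothetical task id; it must run from a container on the same network, since the redis service maps no host port):

    from celery import Celery

    app = Celery(broker="redis://redis:6379/0", backend="redis://redis:6379/0")
    result = app.AsyncResult("hypothetical-task-id")
    print(result.state)  # e.g. PENDING / SUCCESS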
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/MultiTurnWithConversationForm.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import React from "react";
3 | import { TextField, Box } from "@mui/material";
4 |
5 | const MultiTurnWithConversationForm = ({
6 | values,
7 | handleChange,
8 | handleBlur,
9 | errors,
10 | touched,
11 | }) => {
12 | return (
13 |     <Box>
14 |       <Box mb={2}>
15 |         <TextField
16 |           fullWidth
17 |           multiline
18 |           rows={4}
19 |           name="conversation_history"
20 |           label="Conversation history"
21 |           value={values.conversation_history}
22 |           onChange={handleChange}
23 |           onBlur={handleBlur}
24 |           error={touched.conversation_history && Boolean(errors.conversation_history)}
25 |           helperText={touched.conversation_history && errors.conversation_history}
26 |         />
27 |       </Box>
28 |       <Box mb={2}>
29 |         <TextField
30 |           fullWidth
31 |           name="follow_up_query"
32 |           label="Follow up query"
33 |           value={values.follow_up_query}
34 |           onChange={handleChange}
35 |           onBlur={handleBlur}
36 |           error={touched.follow_up_query && Boolean(errors.follow_up_query)}
37 |           helperText={touched.follow_up_query && errors.follow_up_query}
38 |         />
39 |       </Box>
40 |       <Box mb={2}>
41 |         <TextField
42 |           fullWidth
43 |           name="golden_query"
44 |           label="Golden query"
45 |           value={values.golden_query}
46 |           onChange={handleChange}
47 |           onBlur={handleBlur}
48 |           error={touched.golden_query && Boolean(errors.golden_query)}
49 |           helperText={touched.golden_query && errors.golden_query}
50 |         />
51 |       </Box>
52 |       <Box mb={2}>
53 |         <TextField
54 |           fullWidth
55 |           name="rewritten_query"
56 |           label="Rewritten query"
57 |           value={values.rewritten_query}
58 |           onChange={handleChange}
59 |           onBlur={handleBlur}
60 |           error={touched.rewritten_query && Boolean(errors.rewritten_query)}
61 |           helperText={touched.rewritten_query && errors.rewritten_query}
62 |         />
63 |       </Box>
64 |     </Box>
65 | );
66 | };
67 |
68 | export default MultiTurnWithConversationForm;
69 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/SoloResult.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import {
3 | Alert,
4 | Table,
5 | TableHead,
6 | TableRow,
7 | TableCell,
8 | TableBody,
9 | Paper,
10 | } from "@mui/material";
11 | import {
12 | API_TYPE_MULTITURN,
13 | API_TYPE_SINGLETURN,
14 | API_TYPE_RATING,
15 | API_TYPE_SIMILARITY,
16 | } from "@/services/Config";
17 |
18 | import { grade_map_rating, grade_map_similarity, grade_map_multiturn } from "@/services/Config";
19 |
20 | const grade_col_name = "JudgeIt Score"
21 | const explanation_col_name = "JudgeIt Reasoning"
22 |
23 | const SoloResult = ({ data, api_type }) => {
24 | return (
25 |     <Paper
26 |       elevation={2}
27 |       sx={{ width: "100%", overflow: "hidden", padding: "10px" }}
28 |     >
29 |       <Table size="small">
30 |         <TableHead>
31 |           {api_type === API_TYPE_RATING && (
32 |             <TableRow>
33 |               <TableCell>{grade_col_name}</TableCell>
34 |               <TableCell>{explanation_col_name}</TableCell>
35 |             </TableRow>
36 |           )}
37 |           {api_type === API_TYPE_SIMILARITY && (
38 |             <TableRow>
39 |               <TableCell>{grade_col_name}</TableCell>
40 |               <TableCell>{explanation_col_name}</TableCell>
41 |             </TableRow>
42 |           )}
43 |           {(api_type === API_TYPE_MULTITURN || api_type === API_TYPE_SINGLETURN) && (
44 |             <TableRow>
45 |               <TableCell>{grade_col_name}</TableCell>
46 |             </TableRow>
47 |           )}
48 |         </TableHead>
49 |         <TableBody>
50 |           {api_type === API_TYPE_RATING && (
51 |             <TableRow>
52 |               <TableCell>{grade_map_rating[data.Grade]}</TableCell>
53 |               <TableCell>{data.Explanation}</TableCell>
54 |             </TableRow>
55 |           )}
56 |           {api_type === API_TYPE_SIMILARITY && (
57 |             <TableRow>
58 |               <TableCell>{grade_map_similarity[data.Grade]}</TableCell>
59 |               <TableCell>{data.Explanation}</TableCell>
60 |             </TableRow>
61 |           )}
62 |           {(api_type === API_TYPE_MULTITURN || api_type === API_TYPE_SINGLETURN) && (
63 |             <TableRow>
64 |               <TableCell>{grade_map_multiturn[data.Grade]}</TableCell>
65 |             </TableRow>
66 |           )}
67 |         </TableBody>
68 |       </Table>
69 |     </Paper>
71 | };
72 |
73 | export default SoloResult;
74 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/RatingSimilarityDataGrid.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import React from "react";
3 | import { DataGrid } from "@mui/x-data-grid";
4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
5 | import { API_TYPE_RATING, grade_map_rating, grade_map_similarity } from "@/services/Config";
6 |
7 | const RatingSimilarityDataGrid = ({ serverData }) => {
8 | const columns = [
9 | {
10 | field: "id",
11 | headerName: "Id",
12 |       // the Id column is hidden via columnVisibilityModel in initialState below
13 | },
14 | {
15 | field: "name",
16 | headerName: "Name",
17 | width: "250",
18 | },
19 | {
20 | field: "eval_type",
21 | headerName: "Eval Type",
22 | },
23 | {
24 | field: "model",
25 | headerName: "Model",
26 | width: "250",
27 | },
28 | {
29 | field: "golden_text",
30 | headerName: "Golden Text",
31 | width: "400",
32 | },
33 | {
34 | field: "generated_text",
35 | headerName: "Generated Text",
36 | width: "400",
37 | },
38 | {
39 | field: "Grade",
40 | headerName: "JudgeIt Score",
41 | width: 100,
42 | },
43 | {
44 | field: "Explanation",
45 | headerName: "JudgeIt Reasoning",
46 | width: "400",
47 | },
48 | ];
49 |
50 | return (
51 |
52 | {" "}
53 | {
57 | return {
58 | id: item._id,
59 | name: item.name,
60 | eval_type: item.eval_type,
61 | model: item.content.query.model,
62 | golden_text: item.content.query.golden_text,
63 | generated_text: item.content.query.generated_text,
64 | Grade: (item.eval_type === API_TYPE_RATING) ? grade_map_rating[item.content.result.Grade] : grade_map_similarity[item.content.result.Grade],
65 | Explanation: item.content.result.Explanation,
66 | };
67 | }),
68 | }}
69 | density="compact"
70 | getRowHeight={() => "auto"}
71 | autoHeight={true}
72 |       initialState={{
73 |         // `hide` on a column definition was removed in MUI X v6+;
74 |         // the Id column is hidden through the visibility model,
75 |         // and pagination seeds the default page size
76 |         columns: { columnVisibilityModel: { id: false } },
77 |         pagination: { paginationModel: { pageSize: 10 } },
78 |       }}
79 | pageSizeOptions={[5, 10, 25]}
80 | slots={{ toolbar: DataGridToolbar }}
81 | />
82 |
83 | );
84 | };
85 |
86 | export default RatingSimilarityDataGrid;
87 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DataGridMultiTurnConversation.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import React from "react";
3 | import { DataGrid } from "@mui/x-data-grid";
4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
5 |
6 | const DataGridMultiTurnConversation = ({ serverData }) => {
7 | const columns = [
8 | {
9 | field: "id",
10 | headerName: "Id",
11 |       // the Id column is hidden via columnVisibilityModel in initialState below
12 | },
13 | {
14 | field: "name",
15 | headerName: "Name",
16 | width: "250",
17 | },
18 | {
19 | field: "eval_type",
20 | headerName: "Eval Type",
21 | },
22 | {
23 | field: "model",
24 | headerName: "Model",
25 | width: "250",
26 | },
27 | {
28 | field: "conversation_history",
29 | headerName: "Conversation history",
30 | width: "400",
31 | },
32 | {
33 | field: "follow_up_query",
34 | headerName: "Follow up query",
35 | width: "400",
36 | },
37 | {
38 | field: "golden_query",
39 | headerName: "Golden query",
40 | width: "400",
41 | },
42 | {
43 | field: "rewritten_query",
44 | headerName: "Rewritten query",
45 | width: "400",
46 | },
47 | {
48 | field: "Grade",
49 | headerName: "Grade",
50 | width: 100,
51 | }
52 | ];
53 |
54 | return (
55 |
56 | {" "}
57 | {
61 | return {
62 | id: item._id,
63 | name: item.name,
64 | eval_type: item.eval_type,
65 | model: item.content.query.model,
66 | conversation_history: item.content.query.conversation_history,
67 | follow_up_query: item.content.query.follow_up_query,
68 | golden_query: item.content.query.golden_query,
69 | rewritten_query: item.content.query.rewritten_query,
70 | Grade: item.content.result.Grade
71 | };
72 | }),
73 | }}
74 | density="compact"
75 | getRowHeight={() => "auto"}
76 | autoHeight={true}
77 |       initialState={{
78 |         // `hide` on a column definition was removed in MUI X v6+;
79 |         // the Id column is hidden through the visibility model,
80 |         // and pagination seeds the default page size
81 |         columns: { columnVisibilityModel: { id: false } },
82 |         pagination: { paginationModel: { pageSize: 10 } },
83 |       }}
84 | pageSizeOptions={[5, 10, 25]}
85 | slots={{ toolbar: DataGridToolbar }}
86 | />
87 |
88 | );
89 | };
90 |
91 | export default DataGridMultiTurnConversation;
92 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DataGridMultiTurnSummary.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import React from "react";
3 | import { DataGrid } from "@mui/x-data-grid";
4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
5 |
6 | const DataGridMultiTurnSummaryConversation = ({ serverData }) => {
7 | const columns = [
8 | {
9 | field: "id",
10 | headerName: "Id",
11 |       // the Id column is hidden via columnVisibilityModel in initialState below
12 | },
13 | {
14 | field: "name",
15 | headerName: "Name",
16 | width: "250",
17 | },
18 | {
19 | field: "experiment_name",
20 | headerName: "Experiment Name",
21 | width: "250",
22 | },
23 | {
24 | field: "eval_type",
25 | headerName: "Eval Type",
26 | },
27 | {
28 | field: "conversation_history",
29 | headerName: "Conversation history",
30 | width: "500",
31 | },
32 | {
33 | field: "follow_up_query",
34 | headerName: "Follow up query",
35 | width: "300",
36 | },
37 | {
38 | field: "golden_query",
39 | headerName: "Golden query",
40 | width: "300",
41 | },
42 | {
43 | field: "rewritten_query",
44 | headerName: "Rewritten query",
45 | width: "300",
46 | },
47 | {
48 | field: "Grade",
49 | headerName: "JudgeIt Score",
50 | width: 100,
51 | }
52 | ];
53 |
54 | return (
55 |
56 | {" "}
57 | {
61 | return {
62 | id: item._id,
63 | name: item.name,
64 | eval_type: item.eval_type,
65 | experiment_name: item.experiment_name,
66 | conversation_history: item.conversation_history,
67 | follow_up_query: item.follow_up_query,
68 | golden_query: item.golden_query,
69 | rewritten_query: item.rewritten_query,
70 | Grade: (item?.Grade) ? item?.Grade : item?.judgeit_score
71 | };
72 | }),
73 | }}
74 | density="compact"
75 | getRowHeight={() => "auto"}
76 | autoHeight={true}
77 |       initialState={{
78 |         // `hide` on a column definition was removed in MUI X v6+;
79 |         // the Id column is hidden through the visibility model,
80 |         // and pagination seeds the default page size
81 |         columns: { columnVisibilityModel: { id: false } },
82 |         pagination: { paginationModel: { pageSize: 10 } },
83 |       }}
84 | pageSizeOptions={[5, 10, 25]}
85 | slots={{ toolbar: DataGridToolbar }}
86 | />
87 |
88 | );
89 | };
90 |
91 | export default DataGridMultiTurnSummaryConversation;
92 |
--------------------------------------------------------------------------------
/JudgeIt-App/utils/Helper.js:
--------------------------------------------------------------------------------
1 | import {
2 | API_TYPE_MULTITURN,
3 | API_TYPE_RATING,
4 | API_TYPE_SIMILARITY,
5 | grade_map_multiturn,
6 | grade_map_rating,
7 | grade_map_similarity,
8 | } from "@/services/Config";
9 |
10 | export function getRandomInt(max) {
11 | return Math.floor(Math.random() * max);
12 | }
13 |
14 | export function generateRandomString(length = 4) {
15 | const characters =
16 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
17 | let result = "";
18 | for (let i = 0; i < length; i++) {
19 | const randomIndex = Math.floor(Math.random() * characters.length);
20 | result += characters.charAt(randomIndex);
21 | }
22 | return result;
23 | }
24 |
25 | // Function to generate columns dynamically from JSON object keys
26 | export const generateColumns = (jsonObject) => {
27 | return Object.keys(jsonObject).map((key) => ({
28 | field: key,
29 |     headerName: rename_grade_explanation_column_name(key), // rename Grade/Explanation columns, else capitalize
30 | width: 300, // You can adjust the width or make it dynamic
31 | }));
32 | };
33 |
34 | const rename_grade_explanation_column_name = (column_name) => {
35 | if (column_name === "Grade") {
36 | return "JudgeIt Score";
37 | } else if (column_name === "Explanation") {
38 | return "JudgeIt Reasoning";
39 | } else {
40 | return column_name.charAt(0).toUpperCase() + column_name.slice(1);
41 | }
42 | };
43 |
44 | // Function to generate rows dynamically from JSON object
45 | export const generateRows = (jsonObject, eval_type) => {
46 | const firstKey = Object.keys(jsonObject)[0]; // Get the first key to check structure
47 | const rowIds = Object.keys(jsonObject[firstKey]); // Assuming same structure for all keys
48 |
49 | return rowIds.map((_, index) => {
50 | const rowData = { id: index }; // Initialize row with id
51 | Object.keys(jsonObject).forEach((field) => {
52 | rowData[field] = get_rating_label(
53 | eval_type,
54 | field,
55 | jsonObject[field][index]
56 | ); // Add data for each field
57 | });
58 | return rowData;
59 | });
60 | };
61 |
62 | const get_rating_label = (eval_type, column_name, value) => {
63 | if (column_name !== "Grade") return value;
64 |
65 | const gradeMap = {
66 | [API_TYPE_RATING]: grade_map_rating,
67 | [API_TYPE_SIMILARITY]: grade_map_similarity,
68 | [API_TYPE_MULTITURN]: grade_map_multiturn,
69 | };
70 |
71 | return gradeMap[eval_type]?.[value] || value;
72 | };
73 |
74 | export function trimText(text) {
75 | if (text.length > 15) {
76 | return text.substring(0, 15) + "..";
77 | }
78 | return text;
79 | }
80 |
--------------------------------------------------------------------------------
/Framework/wml_setup.py:
--------------------------------------------------------------------------------
1 | from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
2 | from ibm_watsonx_ai.foundation_models import Model
3 |
4 | #config Watsonx.ai environment
5 | api_key = ''
6 | ibm_cloud_url = 'https://us-south.ml.cloud.ibm.com'
7 | project_id = ''
8 |
9 | def send_to_watsonxai(prompts,
10 | model_id="MIXTRAL",
11 | decoding_method="greedy",
12 | max_new_tokens=500,
13 | min_new_tokens=30,
14 | temperature=1.0,
15 | repetition_penalty=1.0
16 | ):
17 |     # map the short model_id to a watsonx model id; unknown ids raise KeyError
18 |     model_name = {"MIXTRAL": "mistralai/mixtral-8x7b-instruct-v01",
19 |                   "LLAMA3": "meta-llama/llama-3-70b-instruct"}[model_id]
20 |
21 | # Instantiate parameters for text generation
22 | model_params = {
23 | GenParams.DECODING_METHOD: decoding_method,
24 | GenParams.MIN_NEW_TOKENS: min_new_tokens,
25 | GenParams.MAX_NEW_TOKENS: max_new_tokens,
26 | GenParams.RANDOM_SEED: 42,
27 | GenParams.TEMPERATURE: temperature,
28 | GenParams.REPETITION_PENALTY: repetition_penalty,
29 | }
30 | model = Model(
31 | model_id=model_name,
32 | params=model_params,
33 | credentials={
34 | "url" : ibm_cloud_url,
35 | "apikey" : api_key
36 | },
37 | project_id=project_id)
38 |
39 | response=model.generate_text(prompts)
40 | return response
41 |
42 |
43 | def send_to_watsonxai_multi_turn(prompts,
44 | model_id="MIXTRAL",
45 | decoding_method="greedy",
46 | max_new_tokens=128,
47 | temperature=0.7,
48 | repetition_penalty=1.0
49 | ):
50 |     # map the short model_id to a watsonx model id; unknown ids raise KeyError
51 |     model_name = {"MIXTRAL": "mistralai/mixtral-8x7b-instruct-v01",
52 |                   "LLAMA3": "meta-llama/llama-3-70b-instruct"}[model_id]
53 |
54 | # Instantiate parameters for text generation
55 | model_params = {
56 | GenParams.DECODING_METHOD: decoding_method,
57 | GenParams.MAX_NEW_TOKENS: max_new_tokens,
58 | GenParams.RANDOM_SEED: 42,
59 | GenParams.TEMPERATURE: temperature,
60 | GenParams.REPETITION_PENALTY: repetition_penalty,
61 | }
62 | model = Model(
63 | model_id=model_name,
64 | params=model_params,
65 | credentials={
66 | "url" : ibm_cloud_url,
67 | "apikey" : api_key
68 | },
69 | project_id=project_id)
70 |
71 | response=model.generate_text(prompts)
72 | return response
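
Note: a usage sketch for send_to_watsonxai; `api_key` and `project_id` at the top of this file must be filled in first, and the prompt below is only a hypothetical example:

    if __name__ == "__main__":
        prompt = "Rate the similarity of these two answers on a scale of 1 to 3: ..."
        result = send_to_watsonxai([prompt], model_id="MIXTRAL")
        print(result)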
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/RatingSimilarityDataGridSummary.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import React from "react";
3 | import { DataGrid } from "@mui/x-data-grid";
4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
5 | import { API_TYPE_RATING, grade_map_rating, grade_map_similarity } from "@/services/Config";
6 |
7 | const RatingSimilarityDataGridSummary = ({ serverData }) => {
8 | const columns = [
9 | {
10 | field: "id",
11 | headerName: "Id",
12 |       // the Id column is hidden via columnVisibilityModel in initialState below
13 | },
14 | {
15 | field: "Question",
16 | headerName: "Question",
17 | width: "250",
18 | },
19 | {
20 | field: "experiment_name",
21 | headerName: "Experiment Name",
22 | width: "100",
23 | },
24 | {
25 | field: "name",
26 | headerName: "Name",
27 | width: "100",
28 | },
29 | {
30 | field: "eval_type",
31 | headerName: "Eval Type",
32 | },
33 | {
34 | field: "golden_text",
35 | headerName: "Golden Text",
36 | width: "400",
37 | },
38 | {
39 | field: "generated_text",
40 | headerName: "Generated Text",
41 | width: "400",
42 | },
43 | {
44 | field: "Grade",
45 | headerName: "JudgeIt Score",
46 | width: 100,
47 | },
48 | {
49 | field: "Explanation",
50 | headerName: "JudgeIt Reasoning",
51 | width: "400",
52 | },
53 | ];
54 |
55 | return (
56 |
57 | {
61 | return {
62 | id: item._id,
63 | Question: item.question,
64 | experiment_name: item.experiment_name,
65 | name: item.name,
66 | eval_type: item.eval_type,
67 | golden_text: item.golden_text,
68 | generated_text: item.generated_text,
69 | Grade: (item?.Grade) ? item?.Grade : item?.judgeit_score,
70 |           Explanation: (item?.Explanation) ? item?.Explanation : item?.judgeit_reasoning
71 | };
72 | }),
73 | }}
74 | density="compact"
75 | getRowHeight={() => "auto"}
76 | autoHeight={true}
77 |       initialState={{
78 |         // `hide` on a column definition was removed in MUI X v6+;
79 |         // the Id column is hidden through the visibility model,
80 |         // and pagination seeds the default page size
81 |         columns: { columnVisibilityModel: { id: false } },
82 |         pagination: { paginationModel: { pageSize: 10 } },
83 |       }}
84 | pageSizeOptions={[5, 10, 25]}
85 | slots={{ toolbar: DataGridToolbar }}
86 | />
87 |
88 | );
89 | };
90 |
91 | export default RatingSimilarityDataGridSummary;
92 |
--------------------------------------------------------------------------------
/JudgeIt-App/app/globals.css:
--------------------------------------------------------------------------------
1 | :root {
2 | --max-width: 1100px;
3 | --border-radius: 12px;
4 | --font-mono: ui-monospace, Menlo, Monaco, "Cascadia Mono", "Segoe UI Mono",
5 | "Roboto Mono", "Oxygen Mono", "Ubuntu Monospace", "Source Code Pro",
6 | "Fira Mono", "Droid Sans Mono", "Courier New", monospace;
7 |
8 | --foreground-rgb: 0, 0, 0;
9 | --background-start-rgb: 214, 219, 220;
10 | --background-end-rgb: 255, 255, 255;
11 |
12 | --primary-glow: conic-gradient(
13 | from 180deg at 50% 50%,
14 | #16abff33 0deg,
15 | #0885ff33 55deg,
16 | #54d6ff33 120deg,
17 | #0071ff33 160deg,
18 | transparent 360deg
19 | );
20 | --secondary-glow: radial-gradient(
21 | rgba(255, 255, 255, 1),
22 | rgba(255, 255, 255, 0)
23 | );
24 |
25 | --tile-start-rgb: 239, 245, 249;
26 | --tile-end-rgb: 228, 232, 233;
27 | --tile-border: conic-gradient(
28 | #00000080,
29 | #00000040,
30 | #00000030,
31 | #00000020,
32 | #00000010,
33 | #00000010,
34 | #00000080
35 | );
36 |
37 | --callout-rgb: 238, 240, 241;
38 | --callout-border-rgb: 172, 175, 176;
39 | --card-rgb: 180, 185, 188;
40 | --card-border-rgb: 131, 134, 135;
41 | }
42 |
43 | @media (prefers-color-scheme: dark) {
44 | :root {
45 | --foreground-rgb: 255, 255, 255;
46 | --background-start-rgb: 0, 0, 0;
47 | --background-end-rgb: 0, 0, 0;
48 |
49 | --primary-glow: radial-gradient(rgba(1, 65, 255, 0.4), rgba(1, 65, 255, 0));
50 | --secondary-glow: linear-gradient(
51 | to bottom right,
52 | rgba(1, 65, 255, 0),
53 | rgba(1, 65, 255, 0),
54 | rgba(1, 65, 255, 0.3)
55 | );
56 |
57 | --tile-start-rgb: 2, 13, 46;
58 | --tile-end-rgb: 2, 5, 19;
59 | --tile-border: conic-gradient(
60 | #ffffff80,
61 | #ffffff40,
62 | #ffffff30,
63 | #ffffff20,
64 | #ffffff10,
65 | #ffffff10,
66 | #ffffff80
67 | );
68 |
69 | --callout-rgb: 20, 20, 20;
70 | --callout-border-rgb: 108, 108, 108;
71 | --card-rgb: 100, 100, 100;
72 | --card-border-rgb: 200, 200, 200;
73 | }
74 | }
75 |
76 | * {
77 | box-sizing: border-box;
78 | padding: 0;
79 | margin: 0;
80 | }
81 |
82 | html,
83 | body {
84 | max-width: 100vw;
85 | overflow-x: hidden;
86 | }
87 |
88 | body {
89 | color: rgb(var(--foreground-rgb));
90 | background: linear-gradient(
91 | to bottom,
92 | transparent,
93 | rgb(var(--background-end-rgb))
94 | )
95 | rgb(var(--background-start-rgb));
96 | }
97 |
98 | a {
99 | color: inherit;
100 | text-decoration: none;
101 | }
102 |
103 | @media (prefers-color-scheme: dark) {
104 | html {
105 | color-scheme: dark;
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DataGridSingleTurn.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import React from "react";
3 | import { DataGrid } from "@mui/x-data-grid";
4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
5 |
6 | const DataGridSingleTurn = ({ serverData }) => {
7 | const columns = [
8 | {
9 | field: "id",
10 | headerName: "Id",
11 |       // the Id column is hidden via columnVisibilityModel in initialState below
12 | },
13 | {
14 | field: "name",
15 | headerName: "Name",
16 | width: "250",
17 | },
18 | {
19 | field: "eval_type",
20 | headerName: "Eval Type",
21 | },
22 | {
23 | field: "model",
24 | headerName: "Model",
25 | width: "250",
26 | },
27 | {
28 | field: "previous_question",
29 | headerName: "Previous Question",
30 | width: "400",
31 | },
32 | {
33 | field: "previous_answer",
34 | headerName: "Previous Answer",
35 | width: "400",
36 | },
37 | {
38 | field: "current_question",
39 | headerName: "Current Question",
40 | width: "400",
41 | },
42 | {
43 | field: "golden_rewritten_question",
44 | headerName: "Golden Rewritten Question",
45 | width: "400",
46 | },
47 | {
48 | field: "rewritten_question",
49 | headerName: "Rewritten Question",
50 | width: "400",
51 | },
52 | {
53 | field: "Grade",
54 | headerName: "Grade",
55 | width: 100,
56 | }
57 | ];
58 |
59 | return (
60 |
61 | {" "}
62 | {
66 | return {
67 | id: item._id,
68 | name: item.name,
69 | eval_type: item.eval_type,
70 | model: item.content.query.model,
71 | previous_question: item.content.query.previous_question,
72 | previous_answer: item.content.query.previous_answer,
73 | current_question: item.content.query.current_question,
74 | golden_rewritten_question: item.content.query.golden_rewritten_question,
75 | rewritten_question: item.content.query.rewritten_question,
76 | Grade: item.content.result.Grade
77 | };
78 | }),
79 | }}
80 | density="compact"
81 | getRowHeight={() => "auto"}
82 | autoHeight={true}
83 |       initialState={{
84 |         // `hide` on a column definition was removed in MUI X v6+;
85 |         // the Id column is hidden through the visibility model,
86 |         // and pagination seeds the default page size
87 |         columns: { columnVisibilityModel: { id: false } },
88 |         pagination: { paginationModel: { pageSize: 10 } },
89 |       }}
90 | pageSizeOptions={[5, 10, 25]}
91 | slots={{ toolbar: DataGridToolbar }}
92 | />
93 |
94 | );
95 | };
96 |
97 | export default DataGridSingleTurn;
98 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DataGridSingleTurnSummary.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import React from "react";
3 | import { DataGrid } from "@mui/x-data-grid";
4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
5 |
6 | const DataGridSingleTurnSummary = ({ serverData }) => {
7 | const columns = [
8 | {
9 | field: "id",
10 | headerName: "Id",
11 |       // the Id column is hidden via columnVisibilityModel in initialState below
12 | },
13 | {
14 | field: "name",
15 | headerName: "Name",
16 | width: "250",
17 | },
18 | {
19 | field: "experiment_name",
20 | headerName: "Experiment Name",
21 | width: "250",
22 | },
23 | {
24 | field: "eval_type",
25 | headerName: "Eval Type",
26 | },
27 |
28 | {
29 | field: "previous_question",
30 | headerName: "Previous Question",
31 | width: "400",
32 | },
33 | {
34 | field: "previous_answer",
35 | headerName: "Previous Answer",
36 | width: "400",
37 | },
38 | {
39 | field: "current_question",
40 | headerName: "Current Question",
41 | width: "400",
42 | },
43 | {
44 | field: "golden_rewritten_question",
45 | headerName: "Golden Rewritten Question",
46 | width: "400",
47 | },
48 | {
49 | field: "rewritten_question",
50 | headerName: "Rewritten Question",
51 | width: "400",
52 | },
53 | {
54 | field: "Grade",
55 | headerName: "JudgeIt Score",
56 | width: 100,
57 | }
58 | ];
59 |
60 | return (
61 |
62 | {" "}
63 | {
67 | return {
68 | id: item._id,
69 | name: item.name,
70 | eval_type: item.eval_type,
71 | experiment_name: item.experiment_name,
72 | previous_question: item.previous_question,
73 | previous_answer: item.previous_answer,
74 | current_question: item.current_question,
75 | golden_rewritten_question: item.golden_rewritten_question,
76 | rewritten_question: item.rewritten_question,
77 | Grade: (item?.Grade) ? item?.Grade : item?.judgeit_score
78 | };
79 | }),
80 | }}
81 | density="compact"
82 | getRowHeight={() => "auto"}
83 | autoHeight={true}
84 |       initialState={{
85 |         // `hide` on a column definition was removed in MUI X v6+;
86 |         // the Id column is hidden through the visibility model,
87 |         // and pagination seeds the default page size
88 |         columns: { columnVisibilityModel: { id: false } },
89 |         pagination: { paginationModel: { pageSize: 10 } },
90 |       }}
91 | pageSizeOptions={[5, 10, 25]}
92 | slots={{ toolbar: DataGridToolbar }}
93 | />
94 |
95 | );
96 | };
97 |
98 | export default DataGridSingleTurnSummary;
99 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DisplayRequestHistoryMultiTurn.jsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import { Grid, Paper, Box, CircularProgress } from "@mui/material";
3 |
4 | const DisplayRequestHistoryMultiTurnConversation = ({ serverData }) => {
5 | return (
6 | <>
7 |
8 |
14 |
15 |
16 | Experiment name:
17 |
18 |
19 | {serverData.experiment_name}
20 |
21 |
22 |
23 | Request type:
24 |
25 |
26 | {serverData.eval_type}
27 |
28 |
29 |
30 | Conversation History:
31 |
32 |
33 | {serverData.content.query.conversation_history}
34 |
35 |
36 |
37 | Follow up query:
38 |
39 |
40 | {serverData.content.query.follow_up_query}
41 |
42 |
43 | Golden query:
44 |
45 |
46 | {serverData.content.query.golden_query}
47 |
48 |
49 | Rewritten query:
50 |
51 |
52 | {serverData.content.query.rewritten_query}
53 |
54 |
55 | Model:
56 |
57 |
58 | {serverData.content.query.model}
59 |
60 |
61 |
62 |
63 |
64 |
65 |
71 |
72 |
73 | Grade:
74 |
75 |
76 | {serverData.content.result.Grade || serverData.content.result.judgeit_score}
77 |
78 |
79 |
80 |
81 | >
82 | );
83 | };
84 |
85 | export default DisplayRequestHistoryMultiTurnConversation;
86 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/RatingSimilarityForm.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import React from "react";
3 | import { TextField, Box, Tooltip } from "@mui/material";
4 | import InfoOutlinedIcon from "@mui/icons-material/InfoOutlined";
5 |
6 | const RatingSimilarityForm = ({
7 | values,
8 | handleChange,
9 | handleBlur,
10 | errors,
11 | touched,
12 | }) => {
13 | return (
14 |
15 |
16 |
26 |
27 |
28 |
29 |
30 |
31 |
43 |
47 |
48 |
49 |
50 |
57 |
69 |
73 |
74 |
75 |
76 |
77 | );
78 | };
79 |
80 | export default RatingSimilarityForm;
81 |
--------------------------------------------------------------------------------
/REST-Service/app/src/services/WatsonXService.py:
--------------------------------------------------------------------------------
1 | from ibm_watsonx_ai.foundation_models import Model
2 | from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
3 |
4 | from langchain_ibm import WatsonxLLM
7 |
8 | class WatsonXService:
9 |
10 | def __init__(self,
11 | api_key,
12 | project_id,
13 | llm_model_id) -> None:
14 | self.api_key = api_key
15 | self.ibm_cloud_url = 'https://us-south.ml.cloud.ibm.com'
16 | self.project_id = project_id
17 | self.llm_model_id = llm_model_id
18 |
19 | def get_wml_llm_services(self,
20 | decoding_method="greedy",
21 | min_new_tokens=1,
22 | max_new_tokens=200,
23 | repetition_penalty=1,
24 | stop_sequences=['}']) -> WatsonxLLM:
25 |
26 | # llm parameters
27 | generate_parameters = {
28 | "decoding_method": decoding_method,
29 | "min_new_tokens": min_new_tokens,
30 | "max_new_tokens": max_new_tokens,
31 | "repetition_penalty": repetition_penalty,
32 | "stop_sequences": stop_sequences
33 | }
34 |
35 | # instantiate the llm
36 | llm_model = WatsonxLLM(apikey=self.api_key,
37 | url=self.ibm_cloud_url,
38 | project_id=self.project_id,
39 | model_id=self.llm_model_id,
40 | params=generate_parameters)
41 | return llm_model
42 |
43 | ## using watsonx machine learning api
44 | def send_to_watsonxai(
45 | self,
46 | prompts,
47 | model_id="meta-llama/llama-3-70b-instruct",
48 | decoding_method="greedy",
49 | max_new_tokens=500,
50 | min_new_tokens=30,
51 | temperature=1.0,
52 | repetition_penalty=1.0
53 | ):
54 |
55 | # Instantiate parameters for text generation
56 | model_params = {
57 | GenParams.DECODING_METHOD: decoding_method,
58 | GenParams.MIN_NEW_TOKENS: min_new_tokens,
59 | GenParams.MAX_NEW_TOKENS: max_new_tokens,
60 | GenParams.RANDOM_SEED: 42,
61 | GenParams.TEMPERATURE: temperature,
62 | GenParams.REPETITION_PENALTY: repetition_penalty,
63 | }
64 |
65 | model = Model(
66 | model_id=model_id,
67 | params=model_params,
68 | credentials={
69 | "url" : self.ibm_cloud_url,
70 | "apikey" : self.api_key
71 | },
72 | project_id=self.project_id)
73 |
74 | response = model.generate_text(prompts)
75 | return response
76 |
--------------------------------------------------------------------------------
/REST-Service/deployment/readme.md:
--------------------------------------------------------------------------------
1 | # Deploy REST Service in OpenShift cluster
2 |
3 | ## Login to OpenShift cluster
4 |
5 | Step 1: Log in to the OpenShift console and copy the login command
6 |
7 |
8 |
9 | Log in with the token, or with your username and password, on the command line.
10 |
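For example, with a token (values are placeholders):

```sh
oc login --token=<token> --server=<cluster-api-url>
```
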
11 | ## Deployment steps
12 |
13 | - Create a new project
14 |
15 | ```sh
16 | oc new-project llm-judge
17 | ```
18 |
19 | - Set the project name in a variable
20 |
21 | ```sh
22 | export NAMESPACE_NAME='llm-judge'
23 | ```
24 |
25 | - We are using the OpenShift internal registry; however, you can use any container registry.
26 |
27 | ```sh
28 | export REGISTRY=$(oc get routes -n openshift-image-registry -o jsonpath='{.items[0].spec.host}')
29 | echo $(oc whoami -t) | docker login $REGISTRY -u $(oc whoami) --password-stdin
30 | ```
31 |
32 | - Build the Docker image and push it to the internal registry
33 |
34 | ```sh
35 | docker build -t $REGISTRY/$NAMESPACE_NAME/backend:v1.0 .
36 | docker push $REGISTRY/$NAMESPACE_NAME/backend:v1.0
37 | ```
38 |
39 | - The deployment directory uses Kustomize. Before applying the deployment, edit the [base/kustomization.yaml](base/kustomization.yaml) file and update the variables below with your values.
40 |
41 | - WATSONX_URL=
42 | - WX_PROJECT_ID=
43 | - IBM_CLOUD_API_KEY=
44 | - LLM_JUDGE_API_KEY=JudgeIt-Secret-Api-Key
45 | - WX_PLATFORM=saas
46 | - WX_USER=
47 | - CELERY_BROKER_URL=redis://redis:6379/0
48 | - CELERY_RESULT_BACKEND=redis://redis:6379/0
49 | - SERVER_URL=
50 | - MONGO_URL=
51 | - MONGO_USER=
52 | - MONGO_PASS=
53 | - MONGO_DB="judgeit_app"
54 |
55 | ```yaml
56 | kind: Kustomization
57 | images:
58 | - name: backend-image-name
59 | newName: image-registry.openshift-image-registry.svc:5000/llm-judge-dev/backend
60 | newTag: v1.0
61 | secretGenerator:
62 | - name: llm-judge-secret
63 | literals:
64 | - WATSONX_URL=
65 | - WX_PROJECT_ID=
66 | - IBM_CLOUD_API_KEY=
67 | - LLM_JUDGE_API_KEY=JudgeIt-Secret-Api-Key
68 | - WX_PLATFORM=saas
69 | - WX_USER=
70 | - CELERY_BROKER_URL=redis://redis:6379/0
71 | - CELERY_RESULT_BACKEND=redis://redis:6379/0
72 | - SERVER_URL=
73 | - MONGO_URL=
74 | - MONGO_USER=
75 | - MONGO_PASS=
76 | - MONGO_DB="judgeit_app"
77 | resources:
78 | - redis/
79 | - celery-worker/
80 | - flower/
81 | - rest-app/
82 | ```
83 |
84 | - Apply the deployment
85 |
86 | ```sh
87 | oc apply -k base/
88 | ```
89 |
90 | - Monitor the deployment
91 |
92 | ```sh
93 | watch oc get deployments,pods
94 | ```
95 |
96 | - Test
97 |
98 | Copy the URL printed by the command below and open it in your browser.
99 |
100 | ```sh
101 | oc get routes/llm-judge-backend -o jsonpath='https://{.spec.host}/docs{"\n"}'
102 | ```
103 |
104 | - Clean up
105 |
106 | ```sh
107 | oc delete -k base/
108 | ```
109 |
--------------------------------------------------------------------------------
/REST-Service/app/src/services/answer_similarity.py:
--------------------------------------------------------------------------------
1 | from langchain_core.prompts import PromptTemplate
2 |
3 | ## Grading a generated text compared to a golden text
4 | SIMILARITY_PROMPT= """Follow these structured steps to accurately assess the similarity between a Golden Text and a Generated Text:
5 | 1. **Role and Task**: Assume the role of an impartial assistant and evaluator. Your task is to assess the similarity between a Golden Text and a Generated Text using the provided information.
6 | 2. **Initial Setup**: Begin by carefully reviewing the Golden Text to understand the key information, entities, and intents it contains. The Golden Text is considered fully correct and comprehensive. Then, examine the Generated Text that needs evaluation.
7 | 3. **Evaluation Criteria**: Evaluate the Generated Text based on the following criteria:
8 | - Output {{"Grade": "1"}} if:
9 | a) The Generated Text matches the Golden Text closely in terms of key entities and intents. Note that these may be worded differently but convey the same meaning.
10 | b) The Generated Text contains all the essential information from the Golden Text, even if presented in a different order or with slight variations in phrasing.
11 | c) The Generated Text includes the core information from the Golden Text and may contain additional relevant details or expansions that don't contradict the original.
12 | - Output {{"Grade": "0"}} if:
13 | a) The Generated Text is missing critical entities or intents that are present in the Golden Text.
14 | b) The Generated Text contains significant factual errors or contradictions when compared to the Golden Text.
15 | c) The overall meaning or intent of the Generated Text substantially differs from the Golden Text.
16 | 4. **Tolerance for Minor Differences**: Allow for minor differences in numerical values, slight variations in proper nouns, and small discrepancies in less critical details, as long as the core meaning and primary facts remain intact.
17 | 5. **Explanation**: After providing the grade, explain your reasoning in 1 sentence, highlighting key similarities or differences that influenced your decision.
18 | 6. **Output Format**: Format your evaluation output strictly as {{"Grade": "evaluated grade", "Explanation": "explanation for grade"}} to ensure clarity and consistency in assessment.
19 | Remember, the goal is to identify substantive similarity rather than expecting word-for-word matches. Focus on the core information, key facts, and overall intent when making your assessment.
20 |
21 | Input:
22 | Golden Text: {prompt_parameter_1}
23 | Generated Text: {prompt_parameter_2}
24 |
25 | Output:
26 | """
27 |
28 | def build_query_similarity_prompt(row):
29 | input_variables = ['prompt_parameter_1', 'prompt_parameter_2']
30 | prompt = PromptTemplate(input_variables=input_variables, template=SIMILARITY_PROMPT)
31 | # create invoke parameter which is a dictionary of your prompt parameters
32 | prompt_data = {'prompt_parameter_1': row['golden_text'],
33 | 'prompt_parameter_2': row['generated_text']}
34 |
35 | return prompt, prompt_data
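
# Example usage (a sketch): `llm` is assumed to be a WatsonxLLM instance, e.g. one
# built by WatsonXService.get_wml_llm_services(); it is not defined in this module.
#
#   prompt, prompt_data = build_query_similarity_prompt(
#       {"golden_text": "Paris is the capital of France.",
#        "generated_text": "France's capital city is Paris."})
#   result = (prompt | llm).invoke(prompt_data)
#   # result is a JSON string such as '{"Grade": "1", "Explanation": "..."}'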
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/BarChart.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { Bar } from 'react-chartjs-2';
3 | import { Chart as ChartJS, CategoryScale, LinearScale, BarElement, Title, Tooltip, Legend } from 'chart.js';
4 | import { API_TYPE_RATING, API_TYPE_SIMILARITY, API_TYPE_MULTITURN } from "@/services/Config";
5 | import ChartDataLabels from 'chartjs-plugin-datalabels';
6 |
7 | ChartJS.register(CategoryScale, LinearScale, BarElement, Title, Tooltip, Legend, ChartDataLabels);
8 |
9 | const BarChart = ({ gradeData, gradeType }) => {
10 | const totalCount = Object.values(gradeData).reduce((sum, count) => sum + count, 0);
11 |
12 | const mapGradeLabels = (label) => {
13 | const labelMaps = {
14 | [API_TYPE_RATING]: {
15 | '1': 'Incorrect',
16 | '2': 'Partially Correct',
17 | '3': 'Correct'
18 | },
19 | [API_TYPE_SIMILARITY]: {
20 | '0': 'Incorrect',
21 | '1': 'Correct'
22 | },
23 | [API_TYPE_MULTITURN]: {
24 | '0': 'Incorrect',
25 | '1': 'Correct'
26 | }
27 | };
28 |
29 | return labelMaps[gradeType]?.[label] || label;
30 | };
31 |
32 | const data = {
33 | labels: Object.keys(gradeData).map(mapGradeLabels),
34 | datasets: [
35 | {
36 | label: 'Count',
37 | data: Object.values(gradeData),
38 | backgroundColor: 'rgba(144, 202, 249, 0.6)',
39 | borderColor: 'rgba(144, 202, 249, 1)',
40 | borderWidth: 1,
41 | },
42 | ],
43 | };
44 |
45 | const options = {
46 | responsive: true,
47 | maintainAspectRatio: false,
48 | scales: {
49 | x: {
50 | title: {
51 | display: true,
52 | text: 'JudgeIt Score',
53 | font: {
54 | size: 14,
55 | weight: 'bold',
56 | },
57 | },
58 | },
59 | y: {
60 | title: {
61 | display: true,
62 | text: 'Count',
63 | font: {
64 | size: 14,
65 | weight: 'bold',
66 | },
67 | },
68 | beginAtZero: true,
69 | },
70 | },
71 | plugins: {
72 | tooltip: {
73 | callbacks: {
74 | label: (context) => {
75 | const count = context.raw;
76 | const percentage = ((count / totalCount) * 100).toFixed(2);
77 | return `Count: ${count} (${percentage}%)`;
78 | },
79 | },
80 | },
81 | datalabels: {
82 | color: 'black', // Label color
83 | anchor: 'end', // Positioning of the label
84 | align: 'top', // Align the label at the top
85 | font: {
86 | weight: 'bold',
87 | size: 12,
88 | },
89 | formatter: (value) => "Count: " + value, // Format the value as you want
90 | },
91 | },
92 | };
93 |
94 | return (
95 | <div style={{ position: "relative", width: "100%", height: "400px" }}>
96 | <Bar data={data} options={options} />
97 | </div>
98 | );
99 | };
100 |
101 | export default BarChart;
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DisplayRequestHistorySingleTurn.jsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import { Grid, Paper, Box, CircularProgress } from "@mui/material";
3 |
4 | const DisplayRequestHistorySingleTurn = ({ serverData }) => {
5 | return (
6 | <>
7 |
8 |
14 |
15 |
16 | Experiment name:
17 |
18 |
19 | {serverData.experiment_name}
20 |
21 |
22 |
23 | Request type:
24 |
25 |
26 | {serverData.eval_type}
27 |
28 |
29 |
30 | Previous question:
31 |
32 |
33 | {serverData.content.query.previous_question}
34 |
35 |
36 |
37 | Previous answer:
38 |
39 |
40 | {serverData.content.query.previous_answer}
41 |
42 |
43 | Current question:
44 |
45 |
46 | {serverData.content.query.current_question}
47 |
48 |
49 | Golden rewritten question:
50 |
51 |
52 | {serverData.content.query.golden_rewritten_question}
53 |
54 |
55 | Rewritten question:
56 |
57 |
58 | {serverData.content.query.rewritten_question}
59 |
60 |
61 | Model:
62 |
63 |
64 | {serverData.content.query.model}
65 |
66 |
67 |
68 |
69 |
70 |
71 |
77 |
78 |
79 | Grade:
80 |
81 |
82 | {serverData.content.result.Grade || serverData.content.result.judgeit_score}
83 |
84 |
85 |
86 |
87 | >
88 | );
89 | };
90 |
91 | export default DisplayRequestHistorySingleTurn;
92 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/SingleTurnForm.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import React from "react";
3 | import { TextField, Box } from "@mui/material";
4 |
5 | const SingleTurnForm = ({
6 | values,
7 | handleChange,
8 | handleBlur,
9 | errors,
10 | touched,
11 | }) => {
12 | return (
13 |
14 |
15 |
25 |
26 |
27 |
39 |
40 |
41 |
51 |
52 |
53 |
65 |
66 |
67 |
79 |
80 |
81 | );
82 | };
83 |
84 | export default SingleTurnForm;
85 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DisplayRequestHistoryRatingSimilarity.jsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import { Grid, Box } from "@mui/material";
3 | import {
4 | API_TYPE_RATING,
5 | API_TYPE_SIMILARITY,
6 | grade_map_rating,
7 | grade_map_similarity,
8 | } from "@/services/Config";
9 |
10 | const DisplayRequestHistoryRatingSimilarity = ({ serverData }) => {
11 | return (
12 | <>
13 |
14 |
20 |
21 |
22 | Experiment name:
23 |
24 |
25 | {serverData.experiment_name}
26 |
27 |
28 |
29 | Request type:
30 |
31 |
32 | {serverData.eval_type}
33 |
34 |
35 | Question:
36 |
37 |
38 | {serverData.content.query.question}
39 |
40 |
41 | Golden Text:
42 |
43 |
44 | {serverData.content.query.golden_text}
45 |
46 |
47 |
48 | LLM Response:
49 |
50 |
51 | {serverData.content.query.generated_text}
52 |
53 |
54 | Model:
55 |
56 |
57 | {serverData.content.query.model}
58 |
59 |
60 |
61 |
62 |
63 |
64 |
70 |
71 |
72 | JudgeIt Score:
73 |
74 | {API_TYPE_RATING === serverData.eval_type && (
75 |
76 | {grade_map_rating[serverData.content.result.Grade]}
77 |
78 | )}
79 | {API_TYPE_SIMILARITY === serverData.eval_type && (
80 |
81 | {grade_map_similarity[serverData.content.result.Grade]}
82 |
83 | )}
84 |
85 |
86 | JudgeIt Reasoning:
87 |
88 |
89 | {serverData.content.result.Explanation}
90 |
91 |
92 |
93 |
94 | >
95 | );
96 | };
97 |
98 | export default DisplayRequestHistoryRatingSimilarity;
99 |
--------------------------------------------------------------------------------
/Framework/main.py:
--------------------------------------------------------------------------------
1 | from answer_similarity import batch_llm_answer_similarity
2 | from answer_rating import batch_llm_answer_rating
3 | from multi_turn_eval import batch_llm_multi_turn_eval
4 |
5 | import pandas as pd
6 | import json
7 | import configparser
8 |
9 | import chardet
10 |
11 | config = configparser.ConfigParser()
12 | config.read('./config.ini')
13 |
14 | ## Setup the filename and values
15 | home_dir = config['Default']['home_dir']
16 | input_file_name = config['Default']['input_file_name']
17 | output_file_name = config['Default']['output_file_name']
18 | model_id = config['Default']['model_id']
19 | judge_type = config['Default']['judge_type']
20 |
21 | input_file = home_dir + input_file_name
22 | output_file = home_dir + output_file_name
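
# Expected config.ini layout (keys match the reads above; values are illustrative):
#
#   [Default]
#   home_dir = ./data/
#   input_file_name = input/sample_input.xlsx
#   output_file_name = output/sample_output.xlsx
#   model_id = meta-llama/llama-3-70b-instruct
#   judge_type = rag_eval_answer_similarity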
23 |
24 | def read_data(input_file):
25 | ## Read the data for batch processing
26 | data_df = pd.DataFrame()
27 | if '.xlsx' in input_file:
28 | data_df = pd.read_excel(input_file)
29 | elif '.csv' in input_file:
30 | with open(input_file, 'rb') as f:
31 | result = chardet.detect(f.read())
32 | data_df = pd.read_csv(input_file, encoding=result['encoding'])
33 | return data_df
34 |
35 | def write_data(data_df):
36 | ## save the output
37 | if '.xlsx' in output_file:
38 | # write the dataframe to an excel file
39 | writer = pd.ExcelWriter(output_file, engine='xlsxwriter')
40 | data_df.to_excel(writer, index=False, sheet_name='Sheet1')
41 | workbook = writer.book
42 | worksheet = writer.sheets['Sheet1']
43 | cell_format = workbook.add_format({'text_wrap': True, 'valign': 'top', 'align': 'left'})
44 | for i, column in enumerate(data_df.columns):
45 | worksheet.set_column(i, i, 40, cell_format)
46 | worksheet.set_column(3, 3, 70, cell_format)
47 | writer.close()
48 | elif '.csv' in output_file:
49 | data_df.to_csv(output_file, index=False)
50 | print(f"File saved to {output_file}")
51 |
52 |
53 | def batch_llm_multi_turn_eval_caller(input_file):
54 | input_data = read_data(input_file)
55 | output_data = batch_llm_multi_turn_eval(model_id, input_data)
56 | write_data(output_data)
57 | return output_data
58 |
59 | def batch_llm_answer_similarity_caller(input_file):
60 | input_data = read_data(input_file)
61 | output_data = batch_llm_answer_similarity(model_id, input_data)
62 | write_data(output_data)
63 | return output_data
64 |
65 | def batch_llm_answer_rating_caller(input_file):
66 | input_data = read_data(input_file)
67 | output_data = batch_llm_answer_rating(model_id, input_data)
68 | write_data(output_data)
69 | return output_data
70 |
71 | def processing(judge_type):
72 | if judge_type == 'multi_turn_eval':
73 | batch_llm_multi_turn_eval_caller(input_file)
74 | elif judge_type == 'rag_eval_answer_similarity':
75 | batch_llm_answer_similarity_caller(input_file)
76 | elif judge_type == 'rag_eval_answer_rating':
77 | batch_llm_answer_rating_caller(input_file)
78 |
79 |
80 |
81 |
82 | processing(judge_type)
83 | ## valid judge_type values (set in config.ini):
84 | # processing('multi_turn_eval')
85 | # processing('rag_eval_answer_similarity')
86 | # processing('rag_eval_answer_rating')
--------------------------------------------------------------------------------
/REST-Service/app/src/services/LLMJudgeService.py:
--------------------------------------------------------------------------------
1 | from langchain_ibm import WatsonxLLM
2 | from app.src.services.answer_similarity import build_query_similarity_prompt
3 | from app.src.services.answer_rating import build_query_rating_prompt
4 | import json
5 | from app.src.services.single_turn_eval import build_single_turn_prompt
6 | from app.src.services.mult_turn_with_conversation_eval import build_multi_turn_prompt
7 |
8 | class LLMJudgeService:
9 |
10 | def __init__(self) -> None:
11 | pass
12 |
13 | def simple_processing_rating(self, golden_text: str, generated_text:str, llm_model: WatsonxLLM):
14 |
15 | prompt, prompt_data = build_query_rating_prompt(row={
16 | "golden_text": golden_text,
17 | "generated_text": generated_text
18 | })
19 |
20 | llm_chain = prompt | llm_model
21 | prompt_results = llm_chain.invoke(prompt_data)
22 | return json.loads(prompt_results)
23 |
24 | def simple_processing_similarity_answer(self, golden_text: str, generated_text:str, llm_model: WatsonxLLM):
25 |
26 | prompt, prompt_data = build_query_similarity_prompt(row={
27 | "golden_text": golden_text,
28 | "generated_text": generated_text
29 | })
30 |
31 | llm_chain = prompt | llm_model
32 |
33 | prompt_results = llm_chain.invoke(prompt_data)
34 | # guard against the model echoing the literal "1" or "0" grading options from the prompt
35 | prompt_results = prompt_results.replace("\"1\" or \"0\"", "\"0\"")
35 | return json.loads(prompt_results)
36 |
37 | def single_trun_llm_judge(self,
38 | previous_question: str,
39 | previous_answer: str,
40 | current_question: str,
41 | golden_rewritten_question: str,
42 | rewritten_question: str,
43 | llm_model: WatsonxLLM):
44 |
45 | prompt, prompt_data = build_single_turn_prompt(row={
46 | "previous_question": previous_question,
47 | "previous_answer": previous_answer,
48 | "current_question": current_question,
49 | "golden_rewritten_question": golden_rewritten_question,
50 | "rewritten_question": rewritten_question
51 | })
52 | llm_chain = prompt | llm_model
53 | prompt_results = {"Grade": None}
54 | try:
55 | prompt_results = json.loads(llm_chain.invoke(prompt_data))
56 | except Exception:
57 | # the model occasionally returns non-JSON output; surface it as an error grade
58 | prompt_results = {"Grade": "Error"}
60 |
61 | return prompt_results
62 |
63 | def multi_trun_llm_judge(self,
64 | conversation_history: str,
65 | follow_up_query: str,
66 | golden_query: str,
67 | rewritten_query: str,
68 | llm_model: WatsonxLLM):
69 |
70 | prompt, prompt_data = build_multi_turn_prompt(row={
71 | "conversation_history": conversation_history,
72 | "follow_up_query": follow_up_query,
73 | "golden_query": golden_query,
74 | "rewritten_query": rewritten_query
75 | })
76 | llm_chain = prompt | llm_model
77 | prompt_results = {"Grade": None}
78 | try:
79 | prompt_results = json.loads(llm_chain.invoke(prompt_data))
80 | except Exception:
81 | # the model occasionally returns non-JSON output; surface it as an error grade
82 | prompt_results = {"Grade": "Error"}
84 |
85 | return prompt_results
86 |
87 |
--------------------------------------------------------------------------------
/REST-Service/app/src/services/answer_rating.py:
--------------------------------------------------------------------------------
1 | from langchain_core.prompts import PromptTemplate
2 |
3 | ## Grading a generated text compared to a golden text
4 | RATING_PROMPT = """Follow these structured steps to accurately assess the similarity between a Golden Text and a Generated Text:
5 | 1. **Role and Task**: Assume the role of an impartial assistant and evaluator. Your task is to assess the similarity between a Golden Text and a Generated Text using the provided information.
6 | 2. **Initial Setup**: Begin by carefully reviewing the Golden Text to understand the key information, entities, and intents it contains. The Golden Text is considered fully correct and comprehensive. Then, examine the Generated Text that needs evaluation.
7 | 3. **Evaluation Criteria**: Evaluate the Generated Text based on the following criteria:
8 | - Output {{"Grade": "1"}} if:
9 | a) The Generated Text is missing critical entities or intents that are present in the Golden Text.
10 | b) The Generated Text contains significant factual errors or contradictions when compared to the Golden Text.
11 | c) The overall meaning or intent of the Generated Text substantially differs from the Golden Text.
12 | - Output {{"Grade": "2"}} if:
13 | a) The Generated Text somewhat matches the Golden Text in terms of key entities and intents. Note that these may be worded differently but convey the same meaning.
14 | b) The Generated Text contains part of the essential information from the Golden Text, even if presented in a different order or with slight variations in phrasing.
15 | c) The Generated Text includes part of the core information from the Golden Text and may contain additional relevant details or expansions that don't contradict the original.
16 | - Output {{"Grade": "3"}} if:
17 | a) The Generated Text matches the Golden Text closely in terms of key entities and intents. Note that these may be worded differently but convey the same meaning.
18 | b) The Generated Text contains all the essential information from the Golden Text, even if presented in a different order or with slight variations in phrasing.
19 | c) The Generated Text includes the core information from the Golden Text and may contain additional relevant details or expansions that don't contradict the original.
20 | 4. **Tolerance for Minor Differences**: Allow for minor differences in numerical values, slight variations in proper nouns, and small discrepancies in less critical details, as long as the core meaning and primary facts remain intact.
21 | 5. **Explanation**: After providing the grade, explain your reasoning in 1 sentence, highlighting key similarities or differences that influenced your decision.
22 | 6. **Output Format**: Format your evaluation output strictly as {{"Grade": "evaluated grade", "Explanation": "explanation for grade"}} to ensure clarity and consistency in assessment.
23 | Remember, the goal is to identify substantive similarity rather than expecting word-for-word matches. Focus on the core information, key facts, and overall intent when making your assessment.
24 |
25 | Input:
26 | Golden Text: {prompt_parameter_1}
27 | Generated Text: {prompt_parameter_2}
28 |
29 | Output:
30 | """
31 |
32 | def build_query_rating_prompt(row):
33 | input_variables = ['prompt_parameter_1', 'prompt_parameter_2']
34 | prompt = PromptTemplate(input_variables=input_variables, template=RATING_PROMPT)
35 | # create invoke parameter which is a dictionary of your prompt parameters
36 | prompt_data = {'prompt_parameter_1': row['golden_text'],
37 | 'prompt_parameter_2': row['generated_text']}
38 |
39 | return prompt, prompt_data
--------------------------------------------------------------------------------
/REST-Service/chart/templates/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: fastapi-app
5 | labels:
6 | app: fastapi-app
7 | spec:
8 | replicas: {{ .Values.replicaCount }}
9 | selector:
10 | matchLabels:
11 | app: fastapi-app
12 | template:
13 | metadata:
14 | labels:
15 | app: fastapi-app
16 | spec:
17 | containers:
18 | - name: fastapi-app
19 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
20 | imagePullPolicy: {{ .Values.image.pullPolicy }}
21 | ports:
22 | - containerPort: {{ .Values.service.fastapi.port }}
23 | env:
24 | - name: WATSONX_URL
25 | value: "{{ .Values.env.WATSONX_URL }}"
26 | - name: WX_PROJECT_ID
27 | value: "{{ .Values.env.WX_PROJECT_ID }}"
28 | - name: IBM_CLOUD_API_KEY
29 | value: "{{ .Values.env.IBM_CLOUD_API_KEY }}"
30 | - name: CELERY_BROKER_URL
31 | value: "{{ .Values.env.CELERY_BROKER_URL }}"
32 | - name: CELERY_RESULT_BACKEND
33 | value: "{{ .Values.env.CELERY_RESULT_BACKEND }}"
34 |
35 | ---
36 |
37 | apiVersion: apps/v1
38 | kind: Deployment
39 | metadata:
40 | name: redis
41 | labels:
42 | app: redis
43 | spec:
44 | replicas: {{ .Values.replicaCount }}
45 | selector:
46 | matchLabels:
47 | app: redis
48 | template:
49 | metadata:
50 | labels:
51 | app: redis
52 | spec:
53 | containers:
54 | - name: redis
55 | image: redis:7.2.5-alpine
56 | ports:
57 | - containerPort: {{ .Values.service.redis.port }}
58 |
59 | ---
60 |
61 | apiVersion: apps/v1
62 | kind: Deployment
63 | metadata:
64 | name: celery-worker
65 | labels:
66 | app: celery-worker
67 | spec:
68 | replicas: {{ .Values.replicaCount }}
69 | selector:
70 | matchLabels:
71 | app: celery-worker
72 | template:
73 | metadata:
74 | labels:
75 | app: celery-worker
76 | spec:
77 | containers:
78 | - name: celery-worker
79 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
80 | imagePullPolicy: {{ .Values.image.pullPolicy }}
81 | command: ["celery", "-A", "app.celery.celery_worker.celery", "worker", "--loglevel=info"]
82 | env:
83 | - name: WATSONX_URL
84 | value: "{{ .Values.env.WATSONX_URL }}"
85 | - name: WX_PROJECT_ID
86 | value: "{{ .Values.env.WX_PROJECT_ID }}"
87 | - name: IBM_CLOUD_API_KEY
88 | value: "{{ .Values.env.IBM_CLOUD_API_KEY }}"
89 | - name: CELERY_BROKER_URL
90 | value: "{{ .Values.env.CELERY_BROKER_URL }}"
91 | - name: CELERY_RESULT_BACKEND
92 | value: "{{ .Values.env.CELERY_RESULT_BACKEND }}"
93 |
94 | ---
95 |
96 | apiVersion: apps/v1
97 | kind: Deployment
98 | metadata:
99 | name: flower
100 | labels:
101 | app: flower
102 | spec:
103 | replicas: {{ .Values.replicaCount }}
104 | selector:
105 | matchLabels:
106 | app: flower
107 | template:
108 | metadata:
109 | labels:
110 | app: flower
111 | spec:
112 | containers:
113 | - name: flower
114 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
115 | imagePullPolicy: {{ .Values.image.pullPolicy }}
116 | command: ["celery", "--broker=redis://redis:6379/0", "flower", "--port=5555"]
117 | ports:
118 | - containerPort: {{ .Values.service.flower.port }}
119 | env:
120 | - name: CELERY_BROKER_URL
121 | value: "{{ .Values.env.CELERY_BROKER_URL }}"
122 | - name: CELERY_RESULT_BACKEND
123 | value: "{{ .Values.env.CELERY_RESULT_BACKEND }}"
124 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/DrawerMenu.jsx:
--------------------------------------------------------------------------------
1 | import { Box, Toolbar, Typography } from "@mui/material";
2 | import Drawer from "@mui/material/Drawer";
3 | import List from "@mui/material/List";
4 | import Divider from "@mui/material/Divider";
5 | import ListItem from "@mui/material/ListItem";
6 | import ListItemButton from "@mui/material/ListItemButton";
7 | import ListItemIcon from "@mui/material/ListItemIcon";
8 | import ListItemText from "@mui/material/ListItemText";
9 | import HomeOutlinedIcon from "@mui/icons-material/HomeOutlined";
10 | import LogoutOutlinedIcon from "@mui/icons-material/LogoutOutlined";
11 | import { signOut } from "next-auth/react";
12 | import GavelOutlinedIcon from "@mui/icons-material/GavelOutlined";
13 | import BatchPredictionOutlinedIcon from "@mui/icons-material/BatchPredictionOutlined";
14 | import HelpCenterOutlinedIcon from "@mui/icons-material/HelpCenterOutlined";
15 | import { app_labels_and_config } from "@/services/Config";
16 |
17 | const DrawerMenu = ({
18 | open,
19 | handleDrawwerOpen,
20 | handleDrawwerClose,
21 | handleLogout,
22 | }) => {
23 | const list = () => (
24 |
30 |
37 |
45 | {app_labels_and_config.app_title}
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 | {
95 | signOut({ callbackUrl: "/" });
96 | }}
97 | >
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 | );
107 |
108 | return (
109 |
110 | {list()}
111 |
112 | );
113 | };
114 |
115 | export default DrawerMenu;
116 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/LeftNavigation.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import { Sidebar, Menu, MenuItem, SubMenu } from "react-pro-sidebar";
3 | import HomeOutlinedIcon from "@mui/icons-material/HomeOutlined";
4 | import { Divider, Toolbar, Typography } from "@mui/material";
5 | import LoginOutlinedIcon from "@mui/icons-material/LoginOutlined";
6 | import LogoutOutlinedIcon from "@mui/icons-material/LogoutOutlined";
7 | import CreateNewFolderOutlinedIcon from "@mui/icons-material/CreateNewFolderOutlined";
8 | import Link from "next/link";
9 | import GavelOutlinedIcon from "@mui/icons-material/GavelOutlined";
10 | import BatchPredictionOutlinedIcon from "@mui/icons-material/BatchPredictionOutlined";
11 | import { useSession, signIn, signOut } from "next-auth/react";
12 | import { useEffect } from "react";
13 |
14 | function LeftNavBar() {
15 | const { data: session, status } = useSession();
16 |
17 | useEffect(() => {
18 | if (
19 | status !== "loading" &&
20 | session &&
21 | session?.error === "RefreshAccessTokenError"
22 | ) {
23 | signOut({ callbackUrl: "/" });
24 | }
25 | }, [session, status]);
26 |
27 | return (
28 | <>
29 | {session && (
30 |
31 |
32 | LLM Judge
33 | {status === "loading" && (
34 | Loading..
35 | )}
36 | {session && (
37 |
38 | Logged in as {session.user.email}
39 |
40 | )}
41 |
42 |
43 |
55 | }
57 | component={ }
58 | >
59 | {" "}
60 | Home{" "}
61 |
62 |
63 | {session && (
64 | }>
65 | {session && (
66 | }
68 | component={ }
69 | >
70 | Single{" "}
71 |
72 | )}
73 | {session && (
74 | }
76 | component={ }
77 | >
78 | Batch{" "}
79 |
80 | )}
81 |
82 | )}
83 |
84 | {!session && (
85 | }
87 | onClick={() => signIn("auth0")}
88 | >
89 | Login
90 |
91 | )}
92 |
93 | {session && (
94 | }
96 | onClick={() => {
97 | signOut({ callbackUrl: "/" });
98 | }}
99 | >
100 | Logout
101 |
102 | )}
103 |
104 |
105 | )}
106 | >
107 | );
108 | }
109 |
110 | export default LeftNavBar;
111 |
--------------------------------------------------------------------------------
/JudgeIt-App/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # JudgeIt Application
5 |
6 | One method of using JudgeIt is through a Service-Oriented Architecture (SOA). This directory contains the code for a React-based application that provides a user interface for interacting with the LLM Judge service. It is built on the Next.js framework and integrates with IBM App ID for authentication. There are three types of evaluation currently available:
7 |
8 | 1. **RAG Evaluation (Similarity)**: evaluate generated text against golden text with a binary (0/1) similarity grade
9 | 2. **RAG Evaluation (Rating)**: evaluate generated text against golden text on a 1-3 rating scale
10 | 3. **Multi-turn evaluation**: evaluate rewritten queries given a multi-turn conversation
11 |
12 | The JudgeIt framework takes input data in the form of excel or csv files for any of these evaluations.
13 |
14 | 
15 |
16 |
17 | ## Table of Contents
18 |
19 | - [Getting Started](#getting-started)
20 | - [Prerequisites](#prerequisites)
21 | - [Installation](#installation)
22 | - [Configuring your Input File](#configuring-your-input-file)
23 | - [Understanding the Results](#understanding-the-results)
24 |
25 |
26 |
27 | ## Getting Started
28 |
29 | ### Prerequisites
30 |
31 | The following prerequisites are required to run the tester:
32 |
33 | 1. [JudgeIt Backend REST Service](/REST-Service/README.md) is up and running
34 | 2. [Node.js](https://nodejs.org/en) v18 or higher
35 | 3. [IBM AppID](https://www.ibm.com/products/app-id) for application authentication
36 |
37 | ### Installation
38 |
39 | 1. Change directory into the JudgeIt App
40 |
41 | ```bash
42 | cd JudgeIt-LLM-as-a-Judge/JudgeIt-App
43 | ```
44 |
45 | 2. Copy env file to .env
46 |
47 | ```bash
48 | cp env .env
49 | ```
50 |
51 | 3. Configure your parameters in .env. Make sure the `NEXT_PUBLIC_LLM_JUDGE_API_KEY` value matches the value assigned in the backend service.
52 |
53 | 4. Install dependencies
54 |
55 | ```bash
56 | npm install
57 | ```
58 |
59 | 5. Run the development server
60 |
61 | ```bash
62 | npm run dev
63 | ```
64 |
65 | 6. Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
66 |
67 | ## Configuring your Input File
68 |
69 | Each type of LLM Judge will accept an excel/csv file as an input file. The repository contains a sample input file for each type of LLM Judge that you can copy, edit, and use to test. They are located at: [JudgeIt-LLM-as-a-Judge/Framework/data/input](../Framework/data/input)
70 |
71 | 1. RAG Evaluation (Similarity): provide an excel/csv file with a `golden_text` column and `generated_text` column to compare
72 | 2. RAG Evaluation (Rating): provide an excel/csv file with a `golden_text` column and `generated_text` column to compare
73 | 3. Multi-turn Evaluation: provide an excel/csv file with the following columns: `previous_question`, `previous_answer`, `current_question`, `golden_rewritten_question`, and `rewritten_question`
74 |
75 | Note: Your input files can contain additional columns beyond the ones specified above. These columns have no effect on the LLM Judge and are preserved in the output file. A minimal example follows.
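
For example, a minimal RAG Evaluation (Similarity or Rating) input file (illustrative values):

```csv
golden_text,generated_text
"The capital of France is Paris.","Paris is France's capital city."
```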
76 |
77 | ## Understanding the Results
78 |
79 | The generated results will be saved to an excel/csv file at the location specified in your config file. Each file will contain all the columns provided in the input file.
80 |
81 | 1. For RAG Evaluation (Similarity), the LLM Judge will output a `Grade` and `Explanation`. A grade of 0 means the texts are dissimilar, while a grade of 1 means the texts are similar.
82 | 2. For RAG Evaluation (Rating), the LLM Judge will output a `Grade` and `Explanation`. A grade of 1 means the texts are dissimilar, a grade of 2 means the texts are partially similar, and a grade of 3 means the texts are significantly similar.
83 | 3. For Multi-turn Evaluation, the LLM Judge will output a `Grade`. A grade of 0 means the golden rewritten question and rewritten question are dissimilar, while a grade of 1 means the questions are similar.
84 |
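For example, a Similarity evaluation output (illustrative values) appends the judge's columns to the input:

```csv
golden_text,generated_text,Grade,Explanation
"The capital of France is Paris.","Paris is France's capital city.",1,"Both texts convey the same key fact."
```
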
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/EvaluationTypeComponent.jsx:
--------------------------------------------------------------------------------
1 | import {
2 | FormControl,
3 | FormHelperText,
4 | RadioGroup,
5 | FormControlLabel,
6 | Radio,
7 | FormLabel,
8 | } from "@mui/material";
9 | import {
10 | API_TYPE_MULTITURN,
11 | API_TYPE_RATING,
12 | API_TYPE_SIMILARITY,
13 | API_TYPE_WBOX_SDR,
14 | API_TYPE_BBOX_SDR,
15 | API_TYPE_KEY,
16 | API_TYPE_SINGLETURN,
17 | API_TYPE_AGENT,
18 | } from "@/services/Config";
19 | import EvaluationTypeLabel from "@/components/judge/EvaluationTypeLabel";
20 |
21 | const EvaluationTypeComponent = ({
22 | values,
23 | handleChange,
24 | handleBlur,
25 | errors,
26 | touched,
27 | api_call_inprogress
28 | }) => {
29 | return (
30 |
31 | {" "}
32 |
37 |
38 | Evaluation Type
39 |
40 |
48 | }
51 | label={
52 |
56 | }
57 | />
58 | }
61 | label={
62 |
66 | }
67 | />
68 | }
71 | label={
72 |
76 | }
77 | />
78 | }
81 | label={
82 |
86 | }
87 | />
88 | }
91 | label={
92 |
96 | }
97 | />
98 | }
101 | label={
102 |
106 | }
107 | />
108 | }
111 | label={
112 |
116 | }
117 | />
118 |
119 |
120 | {touched.apiType && errors.apiType && (
121 | {errors.apiType}
122 | )}
123 |
124 |
125 | );
126 | };
127 |
128 | export default EvaluationTypeComponent;
129 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/SingleInstructions.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import { Box, Typography, Button } from "@mui/material";
3 | import GavelOutlinedIcon from "@mui/icons-material/GavelOutlined";
4 |
5 | function SingleInstructions() {
6 | return (
7 |
8 |
15 |
16 | Single Answer Evaluation Instructions
17 |
18 |
19 |
26 | Evaluate a single input using different LLM Judge types.
27 |
28 |
29 |
30 |
38 | RAG Evaluation (Similarity):
39 |
40 |
41 |
42 | Function: Compare a golden text to a generated text
43 |
44 |
45 | Input: Provide the following:
46 |
47 |
48 |
49 | golden text
50 | generated text
51 |
52 |
53 | Output: The LLM Judge will output a Grade and Explanation.
54 | A grade of 0 means the texts are dissimilar, while a grade of 1
55 | means the texts are similar.
56 |
57 |
58 |
59 |
60 |
61 |
69 | RAG Evaluation (Rating):
70 |
71 |
72 |
73 | Function: Compare a golden text to a generated text
74 |
75 |
76 | Input: Provide the following:
77 |
78 |
79 | golden text
80 | generated text
81 |
82 |
83 | Output: The LLM Judge will output a Grade and Explanation.
84 | A grade of 1 means the texts are dissimilar, a grade of 2 means
85 | the texts are partially similar, and a grade of 3 means the texts
86 | are significantly similar.
87 |
88 |
89 |
90 |
91 |
92 |
100 | Multi-turn Evaluation:
101 |
102 |
103 |
104 | Function: Compare a golden rewritten query to a rewritten
105 | query based on a multi-turn conversation
106 |
107 |
108 | Input: Provide the following:
109 |
110 |
111 | previous question
112 | previous answer
113 | current question
114 | golden rewritten question
115 | rewritten question
116 |
117 |
118 | Output: The LLM Judge will output a Grade and Explanation.
119 | A grade of 0 means the texts are dissimilar, while a grade of 1
120 | means the texts are similar.
121 |
122 |
123 |
124 |
125 |
126 | Single Answer Evaluation
127 |
128 |
129 | );
130 | }
131 |
132 | export default SingleInstructions;
133 |
--------------------------------------------------------------------------------
/REST-Service/app/src/services/ManagementService.py:
--------------------------------------------------------------------------------
1 |
2 | from typing import Any, Dict
3 | from app.src.models.RequestHistory import RequestHistory
4 | from app.src.models.Experiment import Experiment
5 | from app.src.services.MongoService import MongoService
6 | from bson.json_util import dumps, loads
7 | from bson.objectid import ObjectId
8 |
9 | class ManagementService:
10 |
11 | def __init__(self, mongo_db: MongoService) -> None:
12 | self.experiment_collection = mongo_db.get_experiment_collection()
13 | self.history_collection = mongo_db.get_request_history_collection()
14 |
15 | def get_experiments(self, user_id):
16 | cursor = self.experiment_collection.find({ "user_id": user_id })
17 | experiments = [self.bson_to_dict(doc) for doc in cursor]
18 | return experiments
19 |
20 | def get_experiments_by_type(self, user_id: str, type: str):
21 | cursor = self.experiment_collection.find({ "user_id": user_id, "type": type })
22 | experiments = [self.bson_to_dict(doc) for doc in cursor]
23 | return experiments
24 |
25 | def get_experiment_by_name(self, user_id: str, name: str):
26 | cursor = self.experiment_collection.find_one({ "user_id": user_id, "name": name })
27 | if cursor is not None:
28 | return self.bson_to_dict(cursor)
29 | return None
30 |
31 | def get_experiment_by_name_and_type(self, user_id: str, name: str, type: str):
32 | cursor = self.experiment_collection.find_one({ "user_id": user_id, "name": name, "type": type })
33 | if cursor is not None:
34 | return self.bson_to_dict(cursor)
35 | return None
36 |
37 | def get_history_by_id(self, user_id: str, doc_id: str):
38 | object_id = ObjectId(doc_id)
39 | cursor = self.history_collection.find_one({"user_id": user_id, "_id": object_id})
40 | return self.bson_to_dict(cursor)
41 |
42 | def get_histories(self, user_id):
43 | cursor = self.history_collection.find({ "user_id": user_id })
44 | histories = [self.bson_to_dict(doc) for doc in cursor]
45 | return histories
46 |
47 | def get_histories_by_type(self, user_id: str, type: str):
48 | projection = {'content': 0}
49 | cursor = self.history_collection.find({ "user_id": user_id, "type": type }, projection )
50 | histories = [self.bson_to_dict(doc) for doc in cursor]
51 | return histories
52 |
53 | def get_histories_by_experiment_name(self, user_id, experiment_name):
54 | cursor = self.history_collection.find({ "user_id": user_id, "experiment_name": experiment_name })
55 | histories = [self.bson_to_dict(doc) for doc in cursor]
56 | return histories
57 |
58 | def get_histories_by_experiment_name_type(self, user_id: str, experiment_name: str, type: str):
59 | query: dict = { "user_id": user_id, "experiment_name": experiment_name, "type": type }
60 | cursor = self.history_collection.find(query)
62 | histories = [self.bson_to_dict(doc) for doc in cursor]
63 | return histories
64 |
65 | def add_experiment(self, experiment: Experiment):
66 | payload = experiment.model_dump()
67 | insertion = self.experiment_collection.insert_one(payload)
68 | return str(insertion.inserted_id)
69 |
70 | def add_history(self, request_history: RequestHistory) -> str:
71 | payload = request_history.model_dump()
72 | insertion = self.history_collection.insert_one(payload)
73 | return str(insertion.inserted_id)
74 |
75 | def delete_experiment(self, doc_id: str, user_id):
76 | object_id = ObjectId(doc_id)
77 | result = self.experiment_collection.delete_one({"_id": object_id, "user_id": user_id})
78 | return result.deleted_count
79 |
80 | def delete_experiment_by_name(self, experiment_name, user_id):
81 | ## Delete all document under experiment name in request history collection
82 | self.history_collection.delete_many({"experiment_name": experiment_name, "user_id": user_id})
83 | ## Delete from experiment collections
84 | result = self.experiment_collection.delete_one({"name": experiment_name, "user_id": user_id})
85 | return result.deleted_count
86 |
87 | def delete_history(self, doc_id: str, user_id:str):
88 | object_id = ObjectId(doc_id)
89 | result = self.history_collection.delete_one({"_id": object_id, "user_id": user_id})
90 | return result.deleted_count
91 |
92 | # Function to convert BSON document to a dictionary
93 | def bson_to_dict(self, bson_doc) -> Dict[str, Any]:
94 | # Convert ObjectId to string and return as dictionary
95 | doc = bson_doc.copy() # Create a copy to avoid modifying the original
96 | doc['_id'] = str(doc['_id']) # Convert ObjectId to string
97 | return doc
--------------------------------------------------------------------------------
/JudgeIt-App/app/pages/single/doc/[doc_id]/page.js:
--------------------------------------------------------------------------------
1 | "use client";
2 | import { useParams } from "next/navigation";
3 | import { useSession } from "next-auth/react";
4 | import { Grid, Box, Button, Typography, CircularProgress } from "@mui/material";
5 | import EvaluationHistoryLeftBar from "@/components/judge/EvaluationHistoryLeftBar";
6 | import { useEffect, useRef, useState } from "react";
7 | import { fetch_request_history_by_id } from "@/services/ManagemenBackendAPI";
8 | import {
9 | API_TYPE_SINGLETURN,
10 | API_TYPE_MULTITURN,
11 | API_TYPE_RATING,
12 | API_TYPE_SIMILARITY,
13 | } from "@/services/Config";
14 | import DisplayRequestHistoryRatingSimilarity from "@/components/judge/DisplayRequestHistoryRatingSimilarity";
15 | import DisplayRequestHistorySingleTurn from "@/components/judge/DisplayRequestHistorySingleTurn";
16 | import ArrowBackOutlinedIcon from "@mui/icons-material/ArrowBackOutlined";
17 | import Footer from "@/components/globals/Footer";
18 | import DisplayRequestHistoryMultiTurnConversation from "@/components/judge/DisplayRequestHistoryMultiTurn";
19 |
20 | const ItemPage = () => {
21 | const params = useParams();
22 | const { data: session, status } = useSession();
23 | const hasEffectRun = useRef(false);
24 | const [serverData, setServerData] = useState(null);
25 | const { doc_id } = params; // Get the 'id' from the URL
26 |
27 | useEffect(() => {
28 | if (hasEffectRun.current) {
29 | return; // Prevents the effect from running again
30 | }
31 |
32 | const fetch_data = async () => {
33 | const data = await fetch_request_history_by_id(
34 | session.user.email,
35 | doc_id
36 | );
37 | setServerData(data);
38 | };
39 |
40 | if (session?.user.email) {
41 | fetch_data();
42 | hasEffectRun.current = true;
43 | }
44 | }, [session?.user.email, doc_id]); // Re-runs only when the session email or doc id changes
45 |
46 | if (status === "loading") {
47 | return (
48 |
56 |
57 |
58 | );
59 | }
60 |
61 | return (
62 | <>
63 |
64 |
65 |
66 |
67 |
68 | {session && serverData && (
69 |
70 |
71 |
77 |
78 |
83 |
92 | Single Answer Evaluation: {serverData.name}
93 |
94 | }
98 | >
99 | Back
100 |
101 |
102 |
103 | {(API_TYPE_RATING === serverData.eval_type ||
104 | API_TYPE_SIMILARITY === serverData.eval_type) && (
105 |
108 | )}
109 | {API_TYPE_SINGLETURN === serverData.eval_type && (
110 |
111 | )}
112 | {API_TYPE_MULTITURN === serverData.eval_type && (
113 |
114 | )}
115 |
116 |
117 |
118 |
119 |
120 |
121 | )}
122 |
123 |
124 |
125 | >
126 | );
127 | };
128 |
129 | export default ItemPage;
130 |
--------------------------------------------------------------------------------
/JudgeIt-App/app/page.module.css:
--------------------------------------------------------------------------------
1 | .main {
2 | display: flex;
3 | flex-direction: column;
4 | justify-content: space-between;
5 | align-items: center;
6 |
7 | }
8 |
9 | .description {
10 | display: inherit;
11 | justify-content: inherit;
12 | align-items: inherit;
13 | font-size: 0.85rem;
14 | max-width: var(--max-width);
15 | width: 100%;
16 | z-index: 2;
17 | font-family: var(--font-mono);
18 | }
19 |
20 | .description a {
21 | display: flex;
22 | justify-content: center;
23 | align-items: center;
24 | gap: 0.5rem;
25 | }
26 |
27 | .description p {
28 | position: relative;
29 | margin: 0;
30 | padding: 1rem;
31 | background-color: rgba(var(--callout-rgb), 0.5);
32 | border: 1px solid rgba(var(--callout-border-rgb), 0.3);
33 | border-radius: var(--border-radius);
34 | }
35 |
36 | .code {
37 | font-weight: 700;
38 | font-family: var(--font-mono);
39 | }
40 |
41 | .grid {
42 | display: grid;
43 | grid-template-columns: repeat(4, minmax(25%, auto));
44 | max-width: 100%;
45 | width: var(--max-width);
46 | }
47 |
48 | .card {
49 | padding: 1rem 1.2rem;
50 | border-radius: var(--border-radius);
51 | background: rgba(var(--card-rgb), 0);
52 | border: 1px solid rgba(var(--card-border-rgb), 0);
53 | transition: background 200ms, border 200ms;
54 | }
55 |
56 | .card span {
57 | display: inline-block;
58 | transition: transform 200ms;
59 | }
60 |
61 | .card h2 {
62 | font-weight: 600;
63 | margin-bottom: 0.7rem;
64 | }
65 |
66 | .card p {
67 | margin: 0;
68 | opacity: 0.6;
69 | font-size: 0.9rem;
70 | line-height: 1.5;
71 | max-width: 30ch;
72 | text-wrap: balance;
73 | }
74 |
75 | .center {
76 | display: flex;
77 | justify-content: center;
78 | align-items: center;
79 | position: relative;
80 | padding: 4rem 0;
81 | }
82 |
83 | .center::before {
84 | background: var(--secondary-glow);
85 | border-radius: 50%;
86 | width: 480px;
87 | height: 360px;
88 | margin-left: -400px;
89 | }
90 |
91 | .center::after {
92 | background: var(--primary-glow);
93 | width: 240px;
94 | height: 180px;
95 | z-index: -1;
96 | }
97 |
98 | .center::before,
99 | .center::after {
100 | content: "";
101 | left: 50%;
102 | position: absolute;
103 | filter: blur(45px);
104 | transform: translateZ(0);
105 | }
106 |
107 | .logo {
108 | position: relative;
109 | }
110 | /* Enable hover only on non-touch devices */
111 | @media (hover: hover) and (pointer: fine) {
112 | .card:hover {
113 | background: rgba(var(--card-rgb), 0.1);
114 | border: 1px solid rgba(var(--card-border-rgb), 0.15);
115 | }
116 |
117 | .card:hover span {
118 | transform: translateX(4px);
119 | }
120 | }
121 |
122 | @media (prefers-reduced-motion) {
123 | .card:hover span {
124 | transform: none;
125 | }
126 | }
127 |
128 | /* Mobile */
129 | @media (max-width: 700px) {
130 | .content {
131 | padding: 4rem;
132 | }
133 |
134 | .grid {
135 | grid-template-columns: 1fr;
136 | margin-bottom: 120px;
137 | max-width: 320px;
138 | text-align: center;
139 | }
140 |
141 | .card {
142 | padding: 1rem 2.5rem;
143 | }
144 |
145 | .card h2 {
146 | margin-bottom: 0.5rem;
147 | }
148 |
149 | .center {
150 | padding: 8rem 0 6rem;
151 | }
152 |
153 | .center::before {
154 | transform: none;
155 | height: 300px;
156 | }
157 |
158 | .description {
159 | font-size: 0.8rem;
160 | }
161 |
162 | .description a {
163 | padding: 1rem;
164 | }
165 |
166 | .description p,
167 | .description div {
168 | display: flex;
169 | justify-content: center;
170 | position: fixed;
171 | width: 100%;
172 | }
173 |
174 | .description p {
175 | align-items: center;
176 | inset: 0 0 auto;
177 | padding: 2rem 1rem 1.4rem;
178 | border-radius: 0;
179 | border: none;
180 | border-bottom: 1px solid rgba(var(--callout-border-rgb), 0.25);
181 | background: linear-gradient(
182 | to bottom,
183 | rgba(var(--background-start-rgb), 1),
184 | rgba(var(--callout-rgb), 0.5)
185 | );
186 | background-clip: padding-box;
187 | backdrop-filter: blur(24px);
188 | }
189 |
190 | .description div {
191 | align-items: flex-end;
192 | pointer-events: none;
193 | inset: auto 0 0;
194 | padding: 2rem;
195 | height: 200px;
196 | background: linear-gradient(
197 | to bottom,
198 | transparent 0%,
199 | rgb(var(--background-end-rgb)) 40%
200 | );
201 | z-index: 1;
202 | }
203 | }
204 |
205 | /* Tablet and Smaller Desktop */
206 | @media (min-width: 701px) and (max-width: 1120px) {
207 | .grid {
208 | grid-template-columns: repeat(2, 50%);
209 | }
210 | }
211 |
212 | @media (prefers-color-scheme: dark) {
213 | .vercelLogo {
214 | filter: invert(1);
215 | }
216 |
217 | .logo {
218 | filter: invert(1) drop-shadow(0 0 0.3rem #ffffff70);
219 | }
220 | }
221 |
222 | @keyframes rotate {
223 | from {
224 | transform: rotate(360deg);
225 | }
226 | to {
227 | transform: rotate(0deg);
228 | }
229 | }
230 |
--------------------------------------------------------------------------------
/JudgeIt-App/services/ManagemenBackendAPI.js:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import {
3 | LLM_JUDGE_API_KEY_SECRET,
4 | LLM_JUDGE_MANAGEMENT_API_URL,
5 | } from "./Config";
6 |
7 | export async function create_experiment(payload, type) {
8 | if (payload.experiment_option === "new_experiment") {
9 | const headers = {
10 | accept: "application/json",
11 | "user-id": payload.user_id,
12 | LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
13 | "Content-Type": "application/json",
14 | };
15 |
16 | const url = LLM_JUDGE_MANAGEMENT_API_URL + "experiment";
17 |
18 | const data = {
19 | name: payload.new_experiment,
20 | user_id: payload.user_id,
21 | type: type,
22 | };
23 |
24 | await axios.post(url, data, { headers }); // let callers handle any request error
29 | }
30 | }
31 |
32 | export const fetch_experiment_list_by_type = async (user_id, type) => {
33 | const url = LLM_JUDGE_MANAGEMENT_API_URL + "histories/type/" + type;
34 |
35 | const headers = {
36 | accept: "application/json",
37 | "user-id": user_id,
38 | LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
39 | };
40 |
41 | try {
42 | const response = await axios.get(url, { headers });
43 | const data = response.data;
44 | const groupedData = data.reduce((result, item) => {
45 | const { experiment_name } = item;
46 |
47 | // If the experiment_name doesn't exist in result, initialize it with an empty array
48 | if (!result[experiment_name]) {
49 | result[experiment_name] = [];
50 | }
51 |
52 | // Push the current item into the corresponding experiment_name array
53 | result[experiment_name].push(item);
54 |
55 | return result;
56 | }, {});
57 |
58 | return groupedData;
59 | } catch (error) {
60 | console.error("Error fetching data:", error); // Handle any errors
61 | throw error;
62 | }
63 | };
64 |
65 | export const fetch_request_history_by_id = async (user_id, doc_id) => {
66 | const url = LLM_JUDGE_MANAGEMENT_API_URL + "histories/" + doc_id;
67 |
68 | const headers = {
69 | accept: "application/json",
70 | "user-id": user_id,
71 | LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
72 | };
73 |
74 | try {
75 | const response = await axios.get(url, { headers });
76 | const data = response.data;
77 | return data;
78 | } catch (error) {
79 | console.error("Error fetching fetch_request_history_by_id :", error); // Handle any errors
80 | throw error;
81 | }
82 | };
83 |
84 | export const fetch_request_history_by_name_and_type = async (
85 | user_id,
86 | experiment_name,
87 | type
88 | ) => {
89 | const url =
90 | LLM_JUDGE_MANAGEMENT_API_URL +
91 | "histories/name/" +
92 | experiment_name +
93 | "/type/" +
94 | type;
95 |
96 | const headers = {
97 | accept: "application/json",
98 | "user-id": user_id,
99 | LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
100 | };
101 |
102 | try {
103 | const response = await axios.get(url, { headers });
104 | const data = response.data;
105 | return data;
106 | } catch (error) {
107 | console.error(
108 | "Error fetching fetch_request_history_by_name_and_type :",
109 | error
110 | ); // Handle any errors
111 | throw error;
112 | }
113 | };
114 |
115 | export const get_experiment_list = async (user_id, type) => {
116 | const url = LLM_JUDGE_MANAGEMENT_API_URL + "experiments/type/" + type;
117 |
118 | const headers = {
119 | accept: "application/json",
120 | "user-id": user_id,
121 | LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
122 | };
123 |
124 | try {
125 | const response = await axios.get(url, { headers });
126 | const data = response.data;
127 | return data;
128 | } catch (error) {
129 | console.error("Error fetching get_experiment_list :", error); // Handle any errors
130 | throw error;
131 | }
132 | };
133 |
134 | export const delete_history_by_id = async (history_id, user_id) => {
135 | const headers = {
136 | accept: "application/json",
137 | "user-id": user_id,
138 | LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
139 | "Content-Type": "application/json",
140 | };
141 |
142 | try {
143 | const response = await axios.delete(
144 | LLM_JUDGE_MANAGEMENT_API_URL + "history/" + history_id,
145 | {
146 | headers: headers,
147 | }
148 | );
149 | return response.data;
150 | } catch (error) {
151 | console.error(
152 | "Error:",
153 | error.response ? error.response.data : error.message
154 | );
155 | }
156 | };
157 |
158 | export const delete_history_by_experiment_name = async (
159 | experiment_name,
160 | user_id
161 | ) => {
162 | const headers = {
163 | accept: "application/json",
164 | "user-id": user_id,
165 | LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
166 | "Content-Type": "application/json",
167 | };
168 |
169 | try {
170 | const response = await axios.delete(
171 | LLM_JUDGE_MANAGEMENT_API_URL + "experiment/name/" + experiment_name,
172 | {
173 | headers: headers,
174 | }
175 | );
176 | return response.data;
177 | } catch (error) {
178 | console.error(
179 | "Error:",
180 | error.response ? error.response.data : error.message
181 | );
182 | }
183 | };
184 |
--------------------------------------------------------------------------------
/evaluationapp-readme.md:
--------------------------------------------------------------------------------
1 | This addendum extends the original **JudgeIt Application** to support comprehensive evaluation of **agentic workflows** (e.g., SDR+ agents such as Comms, Research, Product, and Chrono).
2 | It introduces three complementary evaluation layers — **Blackbox**, **Whitebox**, and **Negative Testing** — to assess output quality, reasoning validity, and content safety.
3 |
4 | ## Evaluation is performed using the following methods:
5 | 1. Blackbox (Agent-level evaluation)
6 | 2. Whitebox (Workflow-level evaluation)
7 | 3. Negative testing
8 |
9 | ### Blackbox Evaluation (Agent-Level)
10 | **Goal:** Evaluate output quality and completeness of each SDR+ workflow agent — without inspecting internal logic.
11 |
12 | Each agent output is compared against a ground-truth reference and assigned a normalized score based on content inclusion, factual accuracy, and task completion.
13 |
14 | #### Score system:
15 | **RAG-based Agents (0–1):**
16 | - 0: Output is incomplete or factually inaccurate
17 | - 1: Output is complete, accurate, and aligned with the ground truth (includes all details from the relevant tools)
18 | **Multi-faceted Agents (1–3):**
19 | - 1: Poor clarity, off-tone, missing key elements, or fails to funnel in the correct context from previous agentic processes
20 | - 2: Adequate, but with minor content issues
21 | - 3: Excellent: clear, accurate, well-structured, and includes all details
22 |
23 | #### Implementation focuses on the following questions:
24 | - Did the agent include all expected content from the reference?
25 | - Was the information factually correct and contextually relevant?
26 | - Was the task or objective fully completed as intended?
27 | - Were the structure and tone (for Comms) aligned with the workflow standards? (See the normalization sketch below.)
28 |
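To combine the two scales across agents, each grade can be mapped onto a common 0–1 range. Below is a minimal, hypothetical sketch of that normalization; the agent groupings and the judge's JSON reply format mirror conventions used elsewhere in this repo, but the function and variable names are illustrative:

```python
import json

# Assumed agent groupings, per the score systems above.
RAG_BASED_AGENTS = {"Chrono", "Product", "Research"}   # graded 0-1
MULTI_FACETED_AGENTS = {"Comms"}                       # graded 1-3

def normalize_grade(agent: str, judge_reply: str) -> float:
    """Parse a judge reply like {"Grade": "2", ...} and map the grade onto 0-1."""
    grade = int(json.loads(judge_reply)["Grade"])
    if agent in RAG_BASED_AGENTS:
        return float(grade)            # already on a 0-1 scale
    if agent in MULTI_FACETED_AGENTS:
        return (grade - 1) / 2.0       # 1 -> 0.0, 2 -> 0.5, 3 -> 1.0
    raise ValueError(f"Unknown agent: {agent}")

print(normalize_grade("Comms", '{"Grade": "2", "Explanation": "Adequate"}'))  # 0.5
```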
29 | ### Whitebox Evaluation (Workflow-Level)
30 | Whitebox evaluation is trace-based: given a user query and the agent thought trail generated while processing that query, the implementation analyzes the trail and returns a score of 0/1.
31 |
32 | #### Score system:
33 | - 0: agent thought trail has issues / is not valid (not useful)
34 | - 1: agent thought trail works correctly / is valid (useful)
35 |
36 | #### Implementation focuses on the following questions:
37 | 1. Was the flow valid? That is, did it follow the logically required steps (thought, tool usage, thought, final answer, etc.)?
38 | 2. Were tools used by the agent?
39 | 3. Were the right tools used?
40 | 4. Were any errors observed? (See the heuristic sketch below.)
41 |
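For illustration only, a purely heuristic version of these checks might look like the sketch below. The actual implementation judges the trail with an LLM; the "Thought:" / "Action:" / "Final Answer:" markers and the tool-name pattern here are assumptions about the trail format:

```python
import re

def whitebox_score(thought_trail: str, allowed_tools: set) -> int:
    """Return 1 if the trail looks structurally valid, else 0."""
    has_thought = "Thought:" in thought_trail
    has_final_answer = "Final Answer:" in thought_trail
    tools_used = set(re.findall(r"Action:\s*(\w+)", thought_trail))
    used_right_tools = bool(tools_used) and tools_used <= allowed_tools
    has_errors = "Error:" in thought_trail or "Traceback" in thought_trail
    return int(has_thought and has_final_answer and used_right_tools and not has_errors)

trail = "Thought: need dates\nAction: calendar_lookup\nThought: got them\nFinal Answer: ..."
print(whitebox_score(trail, {"calendar_lookup", "web_search"}))  # 1
```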
42 | ### Negative Testing Evaluation
43 | Makes use of the watsonx.governance libraries to run HAP (hate, abuse, profanity) checks on all content present: the input and output of each agent.
44 |
45 | #### Score system:
46 | - 0: HAP/harm/unethical content NOT found
47 | - 1: HAP/harm/unethical content found
48 |
49 | #### LLM as Judge fallback
50 | For cases where wx.gov cannot flag "negative" content, an LLM-as-a-Judge evaluator was developed. Its score system is the same as above (see the sketch below):
51 | - 0: content is clean
52 | - 1: "negative" content found
53 |
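A minimal sketch of such a fallback judge is shown below. It follows the JSON-grade prompt pattern used by the Framework scripts in this repo; the prompt wording and the `llm_invoke` callable are illustrative, not the shipped implementation:

```python
import json

# Hypothetical safety-judge prompt; double braces escape the JSON literals for .format().
NEGATIVE_CONTENT_PROMPT = """You are an impartial content-safety evaluator.
Output {{"Grade": "1"}} if the text contains hateful, abusive, profane, harmful,
or otherwise unethical content; output {{"Grade": "0"}} if the text is clean.
Format your output strictly as {{"Grade": "evaluated grade"}}.

Text: {text}

Output:
"""

def negative_content_score(llm_invoke, text: str) -> int:
    """llm_invoke: any callable that sends a prompt string to the judge LLM and returns its reply."""
    reply = llm_invoke(NEGATIVE_CONTENT_PROMPT.format(text=text))
    return int(json.loads(reply)["Grade"])
```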
54 | #### Implementation:
55 | In order to run the integrated codebase locally, you need to build it slightly differently, since the wx.gov libraries don't work well on Macs:
56 | 
57 | 1. Build the FastAPI backend separately: `podman build --platform=linux/amd64 -t fastapi_app_image -f Dockerfile .`
58 | 2. Replace the docker-compose YAML with the compose file below and add environment variables under the `environment` sections:
59 | ```yaml
60 | services:
61 | fastapi_app:
62 | container_name: fastapi_app
63 | platform: linux/amd64
64 | image: fastapi_app_image
65 | #volumes:
66 | # - ./app:/app
67 | ports:
68 | - 3001:3001
69 | environment:
70 | - WATSONX_URL=https://us-south.ml.cloud.ibm.com
71 | - WX_PROJECT_ID=
72 | - IBM_CLOUD_API_KEY=
73 | - LLM_JUDGE_API_KEY=
74 | - WX_PLATFORM=saas
75 | - WX_USER=''
76 | - WX_GOV_REGION=eu-de
77 | - CELERY_BROKER_URL=redis://redis:6379/0
78 | - CELERY_RESULT_BACKEND=redis://redis:6379/0
79 | - SERVER_URL=http://localhost:3001
80 | restart: always
81 |
82 | redis:
83 | container_name: redis
84 | image: redis:7.2.5-alpine
85 | restart: always
86 |
87 | celery_worker:
88 | container_name: celery_worker
89 | build: .
90 | #volumes:
91 | # - ./app:/app
92 | command: celery -A app.celery.celery_worker.celery worker --loglevel=info
93 | environment:
94 | - WATSONX_URL=https://us-south.ml.cloud.ibm.com
95 | - WX_PROJECT_ID=
96 | - WX_PLATFORM=saas
97 | - WX_USER=''
98 | - WX_GOV_REGION=eu-de
99 | - IBM_CLOUD_API_KEY=
100 | - CELERY_BROKER_URL=redis://redis:6379/0
101 | - CELERY_RESULT_BACKEND=redis://redis:6379/0
102 | depends_on:
103 | - fastapi_app
104 | - redis
105 | restart: always
106 |
107 | flower:
108 | container_name: flower
109 | build: .
110 | command: celery --broker=redis://redis:6379/0 flower --port=5555
111 | ports:
112 | - 5556:5555
113 | environment:
114 | - CELERY_BROKER_URL=redis://redis:6379/0
115 | - CELERY_RESULT_BACKEND=redis://redis:6379/0
116 | depends_on:
117 | - fastapi_app
118 | - redis
119 | - celery_worker
120 | restart: always
121 | ```
122 | 3. Build the rest of the services: `podman-compose build`
123 | 4. Bring the services up: `podman-compose up -d`
124 | 5. Check that all 4 services are up and running: `podman-compose ps`
125 |
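As an optional smoke test once all four services are up, you can exercise one of the judge endpoints directly. The sketch below assumes the similarity payload shape the JudgeIt App sends and that the `LLM_JUDGE_API_KEY` header matches the key configured for the backend:

```python
import requests

resp = requests.post(
    "http://localhost:3001/api/v1/judge/similarity",
    headers={"LLM_JUDGE_API_KEY": "<your-api-key>"},
    json={
        "model": "meta-llama/llama-3-3-70b-instruct",
        "question": "What is the capital of France?",
        "golden_text": "Paris",
        "generated_text": "The capital of France is Paris.",
    },
    timeout=120,
)
print(resp.status_code, resp.json())
```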
126 |
--------------------------------------------------------------------------------
/JudgeIt-App/services/Config.js:
--------------------------------------------------------------------------------
1 | export const APP_VERSION = "Alpha-1.0 version";
2 | //export const LLM_JUDGE_BASE_URL = "https://llm-judge-backend-llm-judge.roks-dsce2v-13d45cd84769aede38d625cd31842ee0-0000.us-south.containers.appdomain.cloud";
3 | export const LLM_JUDGE_BASE_URL = "http://localhost:3001";
4 | export const LLM_JUDGE_BATCH_EVENT_URL =
5 | LLM_JUDGE_BASE_URL + "/api/v1/judge/events/";
6 | export const LLM_JUDGE_DOWNLOAD_EVALUATION_URL =
7 | LLM_JUDGE_BASE_URL + "/api/v1/judge/download/";
8 | export const LLM_JUDGE_MANAGEMENT_API_URL =
9 | LLM_JUDGE_BASE_URL + "/api/v1/manage/";
10 |
11 | export const API_TYPE_KEY = "apiType";
12 | export const API_TYPE_RATING = "rating";
13 | export const API_TYPE_SIMILARITY = "similarity";
14 | export const API_TYPE_SINGLETURN = "singleturn";
15 | export const API_TYPE_MULTITURN = "multiturn";
16 | export const API_TYPE_WBOX_SDR = "whitebox_sdrflow";
17 | export const API_TYPE_BBOX_SDR = "blackbox_sdrflow";
18 | export const API_TYPE_AGENT = "agent_sdrflow";
19 |
20 | export const LLM_JUDGE_API_KEY_SECRET = "JudgeIt-Secret-Api-Key";
21 |
22 | export const LLM_MODELS = [
23 | /*
24 | {
25 | value: "MIXTRAL",
26 | label: "MIXTRAL"
27 | },
28 | {
29 | value: "GPT",
30 | label: "GPT"
31 | },
32 | */
33 | {
34 | value: "meta-llama/llama-3-3-70b-instruct",
35 | label: "llama-3-3-70b-instruct (Recommended)",
36 | },
41 | ];
42 |
43 | export const GITHUB_SOURCE_CODE =
44 | "https://github.com/ibm-ecosystem-engineering/JudgeIt-LLM-as-a-Judge";
45 | export const GITHUB_REPORT_ISSUE =
46 | "https://github.com/ibm-ecosystem-engineering/JudgeIt-LLM-as-a-Judge/issues";
47 |
48 | export const rag_similarity_display = [
49 | "Evaluate generated text against golden text and receive a binary score for similarity",
50 | "The LLM Judge will output a Grade and Explanation. A grade of 0 means the texts are dissimilar, while a grade of 1 means the texts are similar.",
51 | ];
52 |
53 | export const rag_rating_display = [
54 | "Evaluate generated text against golden text and receive a 1/2/3 rating based on degree of similarity",
55 | "The LLM Judge will output a Grade and Explanation. A grade of 1 means the texts are dissimilar, a grade of 2 means the texts are partially similar, and a text of 3 means the texts are significantly similar.",
56 | ];
57 |
58 | export const multi_turn_display = [
59 | "Evaluate rewritten queries given a mult-turn conversation and receive a binary score for similarity",
60 | "The LLM Judge will output a Grade. A grade of 0 means the golden rewritten question and rewritten question are dissimilar, while a grade of 1 means the questions are similar.",
61 | ];
62 |
63 | export const wbox_display = [
64 | "Evaluate generated agent thought trail and workflow execution on a 0/1 rating. 1 means the workflow is executing as expected; 0 means it does not.",
65 | "The LLM Judge will output a score.",
66 | ];
67 |
68 | export const bbox_display = [
69 | "Evaluate generated agent outputs against golden text. It evaluates Chrono, Product, and Research agents on 0/1 rating and Comms Agent on 1/2/3 rating based on degree of similarity",
70 | "The LLM Judge will output a Grade and Explanation. A grade of 1 means the texts are dissimilar, a grade of 2 means the texts are partially similar, and a text of 3 means the texts are significantly similar.",
71 | ];
72 |
73 | export const agent_display = [
74 | "Evaluate generated agent outputs for both black box (LLM-as-a-judge) as well as white box (workflow)",
75 | "The LLM Judge will output a set of grades for the different agents as well as for overall workflow.",
76 | ];
77 |
78 | export const grade_map_rating = {
79 | 1: "Incorrect",
80 | 2: "Partially correct",
81 | 3: "Correct",
82 | };
83 |
84 | export const grade_map_similarity = {
85 | 0: "Incorrect",
86 | 1: "Correct",
87 | };
88 |
89 | export const grade_map_multiturn = {
90 | 0: "Incorrect",
91 | 1: "Correct",
92 | };
93 |
94 |
95 | export const app_labels_and_config = {
96 | app_version: "Alpha-1.0 version",
97 | app_title: "JudgeIt",
98 | app_subtitle: "LLM as a Judge",
99 | logo_text: "Ecosystem Engineering",
100 | buttons: {
101 | single_page_action: "Single answer evaluation",
102 | batch_page_action: "Batch evaluation",
103 | },
104 | home_page_panel_title: {
105 | similarity_panel: "RAG Evaluation (Similarity)",
106 | rating_panel: "RAG Evaluation (Rating)",
107 | multiturn_panel: "Multi-turn evaluation",
108 | home_page_intro:
109 | "JudgeIt is an automated evaluation framework designed for testing various Generative AI pipelines such as RAG, Multi-Turn Query Rewriting, Text-to-SQL, and more. This service utilizes an LLM Judge to accurately and efficiently evaluate generated text against provided golden text. Try evaluating a single input or a batch of inputs by clicking one of the options below!",
110 | },
111 | pages: {
112 | batch_evaluation_page_title: "Batch Evaluation",
113 | single_evaluation_page_title: "Single Answer Evaluation",
114 | graph_title: "Grade Distribution",
115 | },
116 | github: "https://github.com/ibm-ecosystem-engineering/JudgeIt-LLM-as-a-Judge",
117 | github_issues:
118 | "https://github.com/ibm-ecosystem-engineering/JudgeIt-LLM-as-a-Judge/issues",
119 | };
120 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/BatchInstructions.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import { Box, Typography, Button } from "@mui/material";
3 | import BatchPredictionOutlinedIcon from "@mui/icons-material/BatchPredictionOutlined";
4 |
5 | function BatchInstructions() {
6 | return (
7 |
8 |
15 |
19 | Batch Instructions
20 |
21 |
22 |
29 | Each type of LLM Judge will accept an excel/csv file as an input file.
30 | The{" "}
31 |
36 | GitHub repository
37 | {" "}
38 | for this app contains a sample input file for each type of LLM Judge
39 | that you can copy, edit, and use to test.
40 |
41 |
42 |
43 |
51 | RAG Evaluation (Similarity):
52 |
53 |
54 |
55 | Function: Compare a golden text to a generated text
56 |
57 |
58 | Input: Provide an excel/csv file with the following
59 | columns:
60 |
61 |
62 |
63 | golden_text
64 | generated_text
65 |
66 |
67 | Output: The LLM Judge will output a Grade and Explanation.
68 | A grade of 0 means the texts are dissimilar, while a grade of 1
69 | means the texts are similar.
70 |
71 |
72 |
73 |
74 |
75 |
83 | RAG Evaluation (Rating):
84 |
85 |
86 |
87 | Function: Compare a golden text to a generated text
88 |
89 |
90 | Input: Provide an excel/csv file with the following
91 | columns:
92 |
93 |
94 | golden_text
95 | generated_text
96 |
97 |
98 | Output: The LLM Judge will output a Grade and Explanation.
99 | A grade of 1 means the texts are dissimilar, a grade of 2 means
100 |             the texts are partially similar, and a grade of 3 means the texts
101 |             are significantly similar.
102 |
103 |
104 |
105 |
106 |
107 |
115 | Multi-turn Evaluation:
116 |
117 |
118 |
119 | Function: Compare a golden rewritten query to a rewritten
120 | query based on a multi-turn conversation
121 |
122 |
123 | Input: Provide an excel/csv file with the following
124 | columns:
125 |
126 |
127 | previous_question
128 | previous_answer
129 | current_question
130 | golden_rewritten_question
131 | rewritten_question
132 |
133 |
134 | Output: The LLM Judge will output a Grade and Explanation.
135 | A grade of 0 means the texts are dissimilar, while a grade of 1
136 | means the texts are similar.
137 |
138 |
139 |
140 |
141 |
142 |
149 |             Note: Your input files can contain additional columns beyond the
150 |             ones specified above. These columns will have no effect on the LLM Judge
151 | and will be preserved in the output file.
152 |
153 |
154 | Batch Evaluation
155 |
156 |
157 | );
158 | }
159 |
160 | export default BatchInstructions;
161 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/Topbar.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import {
3 | Box,
4 | Typography,
5 | AppBar,
6 | Link,
7 | useMediaQuery,
8 | useTheme,
9 | Tooltip,
10 | IconButton,
11 | } from "@mui/material";
12 | import IBMIcon from "./icons/IBMIcon";
13 | import MenuOutlinedIcon from "@mui/icons-material/MenuOutlined";
14 | import DrawerMenu from "@/components/globals/DrawerMenu";
15 | import { useState } from "react";
16 | import { useSession } from "next-auth/react";
17 | import GitHubIcon from "@mui/icons-material/GitHub";
18 | import {
19 | app_labels_and_config,
20 | } from "@/services/Config";
21 |
22 | const Topbar = () => {
23 | const [drawerOpen, setDrawerOpen] = useState(false);
24 | const { data: session, status } = useSession();
25 | const theme = useTheme();
26 | const isSmallScreen = useMediaQuery(theme.breakpoints.down("sm"));
27 | const isMediumScreen = useMediaQuery(theme.breakpoints.between("sm", "md"));
28 |
29 | const getFontSize = () => {
30 | if (isSmallScreen) return "16px";
31 | if (isMediumScreen) return "18px";
32 | return "20px";
33 | };
34 |
35 | const handleDrawerOpen = () => {
36 | if (drawerOpen) setDrawerOpen(false);
37 | else setDrawerOpen(true);
38 | };
39 |
40 | const handleDrawerClose = (event) => {
41 | if (drawerOpen) setDrawerOpen(false);
42 | };
43 |
44 | return (
45 | <>
46 | {session && (
47 |
54 |
63 |
71 |
79 |
80 |
88 | {app_labels_and_config.logo_text}
89 |
90 |
91 |
92 |
98 |
104 | {app_labels_and_config.app_title}
105 |
106 |
110 | {app_labels_and_config.app_subtitle}
111 |
112 |
113 |
119 |
126 | Logged in as {session.user.email}
127 |
128 |
129 |
130 |
131 |
132 |
133 |
139 |
140 | {app_labels_and_config.app_version}
141 |
142 |
149 | Report an issue
150 |
151 |
152 |
161 |
166 |
167 |
168 |
169 | )}
170 | >
171 | );
172 | };
173 |
174 | export default Topbar;
175 |
--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/ExperimentForm.jsx:
--------------------------------------------------------------------------------
1 | "use client";
2 | import React, { useState, useRef, useEffect } from "react";
3 | import {
4 | TextField,
5 | Box,
6 | FormControlLabel,
7 | RadioGroup,
8 | FormHelperText,
9 | Radio,
10 | FormControl,
11 | InputLabel,
12 | Select,
13 | MenuItem,
14 | Tooltip,
15 | } from "@mui/material";
16 | import { get_experiment_list } from "@/services/ManagemenBackendAPI";
17 | import { useSession } from "next-auth/react";
18 | import { getRandomInt } from "@/utils/Helper";
19 | import InfoOutlinedIcon from "@mui/icons-material/InfoOutlined";
20 |
21 | const ExperimentForm = ({
22 | values,
23 | handleChange,
24 | handleBlur,
25 | errors,
26 | touched,
27 | type,
28 | created_experiment,
29 | }) => {
30 | const [serverData, setServerData] = useState([]);
31 | const hasEffectRun = useRef(false);
32 | const { data: session, status } = useSession();
33 |
34 | useEffect(() => {
35 | if (hasEffectRun.current) {
36 | return; // Prevents the effect from running again
37 | }
38 |
39 | const fetch_data = async () => {
40 | const data = await get_experiment_list(session.user.email, type);
41 | setServerData(data);
42 | };
43 |
44 |     if (session?.user?.email) {
45 | fetch_data();
46 | hasEffectRun.current = true;
47 | }
48 |   }, [session]); // Runs once the session becomes available; the ref guard prevents refetching
49 |
50 | useEffect(() => {
51 | if (created_experiment) {
52 | const newData = {
53 | name: created_experiment,
54 | };
55 | setServerData((prevData) => [...prevData, newData]);
56 | }
57 |   }, [created_experiment]); // Trigger update when `created_experiment` changes
58 |
59 | return (
60 |
61 |
67 |
71 |
79 | }
82 | label="New Experiment"
83 | />
84 | }
87 | label="Select An Existing Experiment"
88 | />
89 |
90 | {touched.experiment_option && errors.experiment_option && (
91 | {errors.experiment_option}
92 | )}
93 |
94 |
98 |
99 |
100 |
101 | {values.experiment_option === "new_experiment" && (
102 |
103 |
113 |
117 |
118 |
119 |
120 | )}
121 | {values.experiment_option === "existing_experiment" && (
122 |
128 |
133 | Experiment
134 |
144 | {serverData.map((item, index) => (
145 |
149 | {item.name}
150 |
151 | ))}
152 |
153 | {touched.existing_experiment && errors.existing_experiment && (
154 | {errors.existing_experiment}
155 | )}
156 |
157 |
161 |
162 |
163 |
164 | )}
165 |
166 | );
167 | };
168 |
169 | export default ExperimentForm;
170 |
--------------------------------------------------------------------------------
/JudgeIt-App/services/JudgeBackendAPISolo.js:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import {
3 | API_TYPE_MULTITURN,
4 | API_TYPE_RATING,
5 | API_TYPE_SIMILARITY,
6 | LLM_JUDGE_BASE_URL,
7 | LLM_JUDGE_API_KEY_SECRET,
8 | LLM_JUDGE_MANAGEMENT_API_URL,
9 | API_TYPE_SINGLETURN,
10 | } from "./Config";
11 |
12 | import { create_experiment } from "./ManagemenBackendAPI";
13 | import { generateRandomString } from "@/utils/Helper";
14 |
15 | /* SOLO API ENDPOINTS */
16 | const API_RATING_URL = LLM_JUDGE_BASE_URL + "/api/v1/judge/rating";
17 | const API_SIMILARITY_URL = LLM_JUDGE_BASE_URL + "/api/v1/judge/similarity";
18 | const API_SINGLE_TURN_URL = LLM_JUDGE_BASE_URL + "/api/v1/judge/singleturn";
19 | const API_MULTITURN_URL = LLM_JUDGE_BASE_URL + "/api/v1/judge/multiturn";
20 |
21 | const config = {
22 | headers: {
23 | accept: "application/json",
24 | LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
25 | "Content-Type": "application/json",
26 | },
27 | };
28 |
29 | /** Single request call*/
30 | export async function judge_api_solo_call(payload) {
31 |   if (payload.apiType === API_TYPE_RATING) {
32 |     return await rating_api_call(payload);
33 |   } else if (payload.apiType === API_TYPE_SIMILARITY) {
34 |     return await similarity_api_call(payload);
35 |   } else if (payload.apiType === API_TYPE_SINGLETURN) {
36 |     return await single_turn_api_call(payload);
37 |   } else if (payload.apiType === API_TYPE_MULTITURN) {
38 |     return await multiturn_conversation_api_call(payload);
39 |   } else {
40 |     throw new Error("API not found: " + payload.apiType);
41 |   }
42 | }
49 |
50 | async function save_request_history(payload, result) {
51 | const headers = {
52 | accept: "application/json",
53 | "Content-Type": "application/json",
54 | "user-id": payload.user_id,
55 | LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
56 | };
57 |
58 | const url = LLM_JUDGE_MANAGEMENT_API_URL + "history";
59 |
60 | const experiment_name =
61 | payload.experiment_option === "new_experiment"
62 | ? payload.new_experiment
63 | : payload.existing_experiment;
64 |
65 | let query = {};
66 | let name = payload.apiType + " - " + generateRandomString(4);
67 |
68 | if (payload.apiType === API_TYPE_SINGLETURN) {
69 | query = {
70 | model: payload.model,
71 | previous_question: payload.previous_question,
72 | previous_answer: payload.previous_answer,
73 | current_question: payload.current_question,
74 | golden_rewritten_question: payload.golden_rewritten_question,
75 | rewritten_question: payload.rewritten_question,
76 | };
77 | } else if (payload.apiType === API_TYPE_MULTITURN) {
78 | query = {
79 | model: payload.model,
80 | conversation_history: payload.conversation_history,
81 | follow_up_query: payload.follow_up_query,
82 | golden_query: payload.golden_query,
83 | rewritten_query: payload.rewritten_query
84 | };
85 | } else {
86 | query = {
87 | model: payload.model,
88 | question: payload.question,
89 | golden_text: payload.golden_text,
90 | generated_text: payload.generated_text,
91 | };
92 | }
93 |
94 | const content = {
95 | query: query,
96 | result: result,
97 | };
98 | const data = {
99 | name: name,
100 | user_id: payload.user_id,
101 | experiment_name: experiment_name,
102 | content: content,
103 | type: "single",
104 | eval_type: payload.apiType,
105 | };
106 |
107 | try {
108 | const response = await axios.post(url, data, { headers });
109 | data._id = response.data.insert_id;
110 | return data;
111 |   } catch (error) {
112 |     console.error("Error saving request history:", error);
113 |   }
112 | }
113 |
114 | async function rating_api_call(payload) {
115 | try {
116 | const response = await axios.post(API_RATING_URL, payload, config);
117 |
118 | // creating new experiment after a successful call
119 | await create_experiment(payload, "single");
120 |
121 | // save the request
122 | const savedObject = await save_request_history(payload, response.data);
123 |
124 | return {
125 | query: savedObject,
126 | data: response.data,
127 | };
128 | } catch (error) {
129 | throw error;
130 | }
131 | }
132 |
133 | async function similarity_api_call(payload) {
134 | try {
135 |     const response = await axios.post(API_SIMILARITY_URL, payload, config);
136 | // creating new experiment after a successful call
137 | await create_experiment(payload, "single");
138 |
139 | // save the request
140 | const savedObject = await save_request_history(payload, response.data);
141 |
142 | return {
143 | query: savedObject,
144 | data: response.data,
145 | };
146 | } catch (error) {
147 | throw error;
148 | }
149 | }
150 |
151 | async function single_turn_api_call(payload) {
152 | try {
153 | const response = await axios.post(API_SINGLE_TURN_URL, payload, config);
154 | // creating new experiment after a successful call
155 | await create_experiment(payload, "single");
156 |
157 | // save the request
158 | const savedObject = await save_request_history(payload, response.data);
159 |
160 | return {
161 | query: savedObject,
162 | data: response.data,
163 | };
164 | } catch (error) {
165 | throw error;
166 | }
167 | }
168 |
169 | async function multiturn_conversation_api_call(payload) {
170 | try {
171 | const response = await axios.post(API_MULTITURN_URL, payload, config);
172 | // creating new experiment after a successful call
173 | await create_experiment(payload, "single");
176 |
177 | // save the request
178 | const savedObject = await save_request_history(payload, response.data);
179 |
180 | return {
181 | query: savedObject,
182 | data: response.data,
183 | };
184 | } catch (error) {
185 | throw error;
186 | }
187 | }
188 |
--------------------------------------------------------------------------------
/REST-Service/app/src/utils/Helper.py:
--------------------------------------------------------------------------------
1 | import io
2 | from fastapi import UploadFile
3 | import pandas as pd
4 |
5 | class Helper:
6 |
7 | def __init__(self) -> None:
8 | pass
9 |
10 | def read_data(self, file_name: str, file_content: bytes) -> pd.DataFrame:
11 |
12 | file_extension = file_name.split(".")[-1].lower()
13 | if file_extension not in ['xls', 'xlsx', 'csv']:
14 | raise Exception("Bad file types, accepted file types are xls, xlsx, and csv")
15 |
16 |         ## Read the data for batch processing
17 |         data_df = pd.DataFrame()
18 |         file_stream = io.BytesIO(file_content)
19 |         if file_extension in ('xls', 'xlsx'):
20 |             data_df = pd.read_excel(file_stream)
21 |         elif file_extension == 'csv':
22 |             data_df = pd.read_csv(file_stream)
23 |         return data_df
24 |
25 |
26 | def validate_single_turn_fields(self, data_df: pd.DataFrame):
27 |
28 | # Normalize the column names to lowercase
29 | data_df.columns = map(str.lower, data_df.columns)
30 |
31 | required_columns = ["previous_question", "previous_answer", "current_question", "golden_rewritten_question", "rewritten_question"]
32 |
33 | if all(column in data_df.columns for column in required_columns):
34 | return True
35 |
36 | columns = ", ".join(required_columns)
37 |
38 | raise Exception("Required columns are missing, valid columns are ## " + columns)
39 |
40 | def validate_multi_turn_with_conversation_fields(self, data_df: pd.DataFrame):
41 |
42 | # Normalize the column names to lowercase
43 | data_df.columns = map(str.lower, data_df.columns)
44 |
45 | required_columns = ["conversation_history", "follow_up_query", "golden_query", "rewritten_query"]
46 |
47 | if all(column in data_df.columns for column in required_columns):
48 | return True
49 |
50 | columns = ", ".join(required_columns)
51 |
52 | raise Exception("Required columns are missing, valid columns are ## " + columns)
53 |
54 | def validate_rating_and_similarity_fields(self, data_df: pd.DataFrame):
55 | # Normalize the column names to lowercase
56 | data_df.columns = map(str.lower, data_df.columns)
57 |
58 | # Define required columns in lowercase
59 | required_columns = ["question", "golden_text", "generated_text"]
60 |
61 | # Check if all required columns are present (case-insensitive)
62 | if all(column in data_df.columns for column in required_columns):
63 | return True
64 |
65 | columns = ", ".join(required_columns)
66 |
67 | raise Exception("Required columns are missing, valid columns are ## " + columns)
68 |
69 |
70 |     @staticmethod
71 |     def is_valid_file(file: UploadFile):
72 |         filename = file.filename
73 |         file_extension = filename.split(".")[-1].lower()
74 |         return file_extension in ('csv', 'xls', 'xlsx')
75 | 
76 |     # This code was added to handle the case when the columns produced by the langfuse script had a lowercase o in the
77 |     # Chrono Agent output field; this is not needed because we'll change all columns to title format before sending
78 |     # to whitebox eval
79 |     def validate_wbox_eval_fields(self, data_df: pd.DataFrame):
80 | 
81 |         ## Data provided has it as "Chrono Agent output" instead of "Chrono Agent Output" so made that change here..
82 | 
83 |         required_columns = ["Chrono Agent output", "Product Agent Output", "Research Agent Output", "Comms Agent Output"]
84 | 
85 |         if all(column in data_df.columns for column in required_columns):
86 |             return True
87 | 
88 |         columns = ", ".join(required_columns)
89 | 
90 |         raise Exception("Required columns are missing, valid columns are ## " + columns)
91 | 
92 |     # def validate_wbox_eval_fields(self, data_df: pd.DataFrame):
93 | 
94 |     #     required_columns = ["Chrono Agent Output", "Product Agent Output", "Research Agent Output", "Comms Agent Output"]
95 | 
96 |     #     if all(column in data_df.columns for column in required_columns):
97 |     #         return True
98 | 
99 |     #     columns = ", ".join(required_columns)
100 | 
101 |     #     raise Exception("Required columns are missing, valid columns are ## " + columns)
102 | 
103 |     def validate_bbox_eval_fields(self, data_df: pd.DataFrame):
104 | 
105 |         required_columns = ["Chrono Agent Output", "Product Agent Output", "Research Agent Output", "Comms Agent Output"]
106 | 
107 |         for col in required_columns:
108 |             colfound = col in data_df.columns
109 |             print(f"col {col} found: {colfound}")
110 | 
111 |         if all(column in data_df.columns for column in required_columns):
112 |             return True
113 | 
114 |         columns = ", ".join(required_columns)
115 | 
116 |         raise Exception("Required columns are missing, valid columns are ## " + columns)
117 | 
118 |     def validate_neg_test_eval_fields(self, data_df: pd.DataFrame):
119 | 
120 |         required_columns = ["Research Agent Output", "Comms Agent Output"]
121 | 
122 |         if all(column in data_df.columns for column in required_columns):
123 |             return True
124 | 
125 |         columns = ", ".join(required_columns)
126 | 
127 |         raise Exception("Required columns are missing, valid columns are ## " + columns)
128 | 
129 |     def validate_agent_eval_fields(self, data_df: pd.DataFrame):
130 | 
131 |         required_columns = ["Chrono Agent Output", "Product Agent Output", "Research Agent Output", "Comms Agent Output"]
132 | 
133 |         for col in required_columns:
134 |             colfound = col in data_df.columns
135 |             print(f"col {col} found: {colfound}")
136 | 
137 |         if all(column in data_df.columns for column in required_columns):
138 |             return True
139 | 
140 |         columns = ", ".join(required_columns)
141 | 
142 |         raise Exception("Required columns are missing, valid columns are ## " + columns)
151 |
--------------------------------------------------------------------------------
/Framework/answer_similarity.py:
--------------------------------------------------------------------------------
1 | import json
2 | import configparser
3 | from langchain_ibm import WatsonxLLM
4 | from langchain_core.prompts import PromptTemplate
5 | import sys
6 |
7 | config = configparser.ConfigParser()
8 | config.read('./config.ini')
9 |
10 | ## Grading a generated text compared to a golden text
11 | SIMILARITY_PROMPT= """Follow these structured steps to accurately assess the similarity between a Golden Text and a Generated Text:
12 | 1. **Role and Task**: Assume the role of an impartial assistant and evaluator. Your task is to assess the similarity between a Golden Text and a Generated Text using the provided information.
13 | 2. **Initial Setup**: Begin by carefully reviewing the Golden Text to understand the key information, entities, and intents it contains. The Golden Text is considered fully correct and comprehensive. Then, examine the Generated Text that needs evaluation.
14 | 3. **Evaluation Criteria**: Evaluate the Generated Text based on the following criteria:
15 | - Output {{"Grade": "1"}} if:
16 | a) The Generated Text matches the Golden Text closely in terms of key entities and intents. Note that these may be worded differently but convey the same meaning contextually.
17 | b) The Generated Text contains all the essential information from the Golden Text, even if presented in a different order or with slight variations in phrasing.
18 | c) The Generated Text includes the core information from the Golden Text or may contain additional relevant, concise details or expansions that don't contradict the contextual meaning of the Golden Text.
19 | - Output {{"Grade": "0"}} if:
20 | a) The Generated Text is missing critical entities or intents that are present in the Golden Text.
21 | b) The Generated Text contains significant factual errors or contradictions when compared to the Golden Text.
22 | c) The overall meaning or intent of the Generated Text substantially differs from the Golden Text.
23 | 4. **Tolerance for Minor Differences**: Allow for minor differences in numerical values, slight variations in proper nouns, and small discrepancies in less critical details, as long as the core meaning and primary facts remain intact.
24 | 5. **Explanation**: After providing the grade, explain your reasoning in 1 sentence, highlighting key similarities or differences that influenced your decision.
25 | 6. **Output Format**: Format your evaluation output strictly as {{"Grade": "evaluated grade", "Explanation": "explanation for grade"}} to ensure clarity and consistency in assessment.
26 | Remember, the goal is to identify substantive similarity rather than expecting word-for-word matches. Focus on the core information, key facts, and overall intent when making your assessment.
27 |
28 | Input:
29 | Golden Text: {prompt_parameter_1}
30 | Generated Text: {prompt_parameter_2}
31 |
32 | Output:
33 | """
34 |
35 | def batch_llm_answer_similarity(model_id, input_data):
36 | # watsonx.ai credentials for llm judge
37 |
38 | # instantiate wml connection
39 | wml_credentials = {
40 | "url": config['WML_CRED']['wml_url'],
41 | "apikey": config['WML_CRED']['api_key']
42 | }
43 |
44 | project_id = config['WML_CRED']['project_id']
45 |
46 | llm_model_id = model_id
47 |
48 | # llm parameters
49 | generate_parameters_1 = {
50 | "decoding_method": "greedy",
51 | "min_new_tokens": 1,
52 | "max_new_tokens": 200,
53 | "repetition_penalty": 1,
54 | "stop_sequences": ['}']
55 | }
56 |
57 | platform = config['WML_CRED']['wml_platform']
58 | if platform == "saas":
59 |         # instantiate llm
60 | llm_model = WatsonxLLM(apikey=wml_credentials['apikey'],
61 | url=wml_credentials['url'],
62 | project_id=project_id,
63 | model_id=llm_model_id,
64 | params=generate_parameters_1)
65 | elif platform == "onpremise":
66 | wml_user = config['WML_CRED']['wml_user']
67 | llm_model = WatsonxLLM(apikey=wml_credentials['apikey'],
68 | url=wml_credentials['url'],
69 | model_id=llm_model_id,
70 | username=wml_user,
71 | instance_id='openshift',
72 | project_id=project_id,
73 | version="5.0",
74 | params=generate_parameters_1)
75 | else:
76 | raise Exception("Please set a correct value in config.ini [WML_CRED][wml_platform], correct values are `onpremise` or `saas` ")
77 |
78 | input_data['Grade'] = None
79 | input_data['Explanation'] = None
80 |
81 |     input_variables = ['prompt_parameter_1', 'prompt_parameter_2']
82 |     prompt = PromptTemplate(input_variables=input_variables, template=SIMILARITY_PROMPT)
83 |     llm_chain = prompt | llm_model
84 | 
85 |     for index, row in input_data.iterrows():
86 |         # create invoke parameter which is a dictionary of your prompt parameters
87 |         try:
88 |             prompt_data = {'prompt_parameter_1': row['golden_text'],
89 |                            'prompt_parameter_2': row['generated_text']}
90 |         except KeyError as e:
91 |             print(f"Error: Missing required column - {e}")
92 |             print("Input file requires the following columns:")
93 |             print("1) golden_text")
94 |             print("2) generated_text")
95 |             sys.exit(1)
96 |         try:
97 |             prompt_results = json.loads(llm_chain.invoke(prompt_data))
98 |         except Exception:
99 |             prompt_results = 'Error generating results'
98 |
99 | if prompt_results == 'Error generating results':
100 | input_data.at[index,'Grade'] = 'Error'
101 | input_data.at[index,'Explanation'] = 'Error'
102 | else:
103 | input_data.at[index,'Grade'] = int(prompt_results['Grade'])
104 | input_data.at[index,'Explanation'] = prompt_results['Explanation']
105 | input_string = f"Golden Text: {prompt_data['prompt_parameter_1']}\n\nGenerated Text: {prompt_data['prompt_parameter_2']}"
106 | print(f'-------------testing input {index + 1}-------------\n')
107 | print(f'1) Input:\n\n{input_string}\n\n')
108 | print(f'2) Output:\n\n{prompt_results}\n\n')
109 |
110 | return input_data
111 |
--------------------------------------------------------------------------------