├── REST-Service
    ├── app
    │   ├── __init__.py
    │   ├── src
    │   │   ├── __init__.py
    │   │   ├── models
    │   │   │   ├── Experiment.py
    │   │   │   ├── RequestHistory.py
    │   │   │   ├── LLMInput.py
    │   │   │   ├── MultiTurnInput.py
    │   │   │   └── SingleTurnInput.py
    │   │   ├── config
    │   │   │   └── TimeoutMiddleware.py
    │   │   ├── services
    │   │   │   ├── MongoService.py
    │   │   │   ├── WatsonXService.py
    │   │   │   ├── answer_similarity.py
    │   │   │   ├── LLMJudgeService.py
    │   │   │   ├── answer_rating.py
    │   │   │   └── ManagementService.py
    │   │   └── utils
    │   │   │   └── Helper.py
    │   └── route
    │   │   └── root
    │   │       └── routes.py
    ├── deployment
    │   ├── base
    │   │   ├── redis
    │   │   │   ├── kustomization.yaml
    │   │   │   ├── service.yaml
    │   │   │   └── deployment.yaml
    │   │   ├── celery-worker
    │   │   │   ├── kustomization.yaml
    │   │   │   ├── service.yaml
    │   │   │   └── deployment.yaml
    │   │   ├── flower
    │   │   │   ├── kustomization.yaml
    │   │   │   ├── route.yaml
    │   │   │   ├── service.yaml
    │   │   │   └── deployment.yaml
    │   │   ├── rest-app
    │   │   │   ├── kustomization.yaml
    │   │   │   ├── route.yaml
    │   │   │   ├── service.yaml
    │   │   │   ├── deployment.yaml
    │   │   │   └── secret.yaml
    │   │   └── kustomization.yaml
    │   └── readme.md
    ├── chart
    │   ├── Chart.yaml
    │   ├── values.yaml
    │   └── templates
    │   │   ├── service.yaml
    │   │   └── deployment.yaml
    ├── Dockerfile
    ├── requirements.txt
    ├── cert
    │   └── mongo.crt
    ├── main.py
    └── docker-compose.yml
├── JudgeIt-App
    ├── .eslintrc.json
    ├── jsconfig.json
    ├── next.config.js
    ├── app
    │   ├── favicon.ico
    │   ├── (auth)
    │   │   └── signin
    │   │   │   └── page.js
    │   ├── api
    │   │   └── auth
    │   │   │   ├── [...nextauth]
    │   │   │       └── route.js
    │   │   │   └── logout
    │   │   │       └── route.js
    │   ├── layout.js
    │   ├── pages
    │   │   ├── help
    │   │   │   └── page.js
    │   │   └── single
    │   │   │   └── doc
    │   │   │       └── [doc_id]
    │   │   │           └── page.js
    │   ├── globals.css
    │   └── page.module.css
    ├── env
    ├── utils
    │   ├── setDynamicRoute.js
    │   ├── sessionProviderWrapper.js
    │   ├── encryption.js
    │   ├── sessionTokenAccessor.js
    │   └── Helper.js
    ├── components
    │   ├── globals
    │   │   ├── PageTitle.jsx
    │   │   ├── DataGridToolbar.jsx
    │   │   ├── LinearProgressWithLabel.jsx
    │   │   ├── Footer.jsx
    │   │   ├── icons
    │   │   │   ├── IBMIconTop.jsx
    │   │   │   └── IBMIcon.jsx
    │   │   ├── SignIn.jsx
    │   │   ├── DeleteConfirmationDialog.jsx
    │   │   ├── BarChart.jsx
    │   │   ├── DrawerMenu.jsx
    │   │   ├── LeftNavigation.jsx
    │   │   ├── SingleInstructions.jsx
    │   │   ├── BatchInstructions.jsx
    │   │   └── Topbar.jsx
    │   └── judge
    │   │   ├── EvaluationTypeLabel.jsx
    │   │   ├── MultiTurnWithConversationForm.jsx
    │   │   ├── SoloResult.jsx
    │   │   ├── RatingSimilarityDataGrid.jsx
    │   │   ├── DataGridMultiTurnConversation.jsx
    │   │   ├── DataGridMultiTurnSummary.jsx
    │   │   ├── RatingSimilarityDataGridSummary.jsx
    │   │   ├── DataGridSingleTurn.jsx
    │   │   ├── DataGridSingleTurnSummary.jsx
    │   │   ├── DisplayRequestHistoryMultiTurn.jsx
    │   │   ├── RatingSimilarityForm.jsx
    │   │   ├── DisplayRequestHistorySingleTurn.jsx
    │   │   ├── SingleTurnForm.jsx
    │   │   ├── DisplayRequestHistoryRatingSimilarity.jsx
    │   │   ├── EvaluationTypeComponent.jsx
    │   │   └── ExperimentForm.jsx
    ├── deployment
    │   ├── readme.md
    │   └── deployment.yaml
    ├── .gitignore
    ├── Dockerfile
    ├── package.json
    ├── styles
    │   └── globals.css
    ├── public
    │   ├── next.svg
    │   └── vercel.svg
    ├── README.md
    └── services
    │   ├── ManagemenBackendAPI.js
    │   ├── Config.js
    │   └── JudgeBackendAPISolo.js
├── .gitignore
├── images
    ├── features.png
    ├── swagger-ui.png
    ├── LLM-Judge-App.png
    ├── flow-diagram.png
    ├── llm-judge-app-saas.png
    ├── LLM-Judge_framework.png
    ├── multiturn-app-batch.gif
    ├── multiturn-framework.gif
    ├── LLM-Judge-app-onpremise.png
    ├── RAG-reliability-testing.png
    ├── multi-turn-evaluation.png
    ├── LLM-judge-framework-saas.png
    ├── rest-service-architecture.png
    ├── rag-evaluation-reliability.png
    ├── LLM-Judge-Architecture-Backend.png
    ├── llm-judge-framework-onpremise.png
    └── multi-turn-evaluation-reliability.png
├── Framework
    ├── data
    │   ├── input
    │   │   ├── sample_multi_turn_input.xlsx
    │   │   ├── sample_rag_answer_rating_input.xlsx
    │   │   └── sample_rag_answer_similarity_input.xlsx
    │   └── output
    │   │   ├── sample_multi_turn_output.xlsx
    │   │   ├── sample_rag_answer_rating_output.xlsx
    │   │   └── sample_rag_answer_similarity_output.xlsx
    ├── config.ini
    ├── sample_config.ini
    ├── wml_setup.py
    ├── main.py
    └── answer_similarity.py
├── requirements.txt
└── evaluationapp-readme.md


/REST-Service/app/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/JudgeIt-App/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "extends": "next/core-web-vitals"
3 | }
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | .DS_Store
3 | __pycache__
4 | config.ini
5 | virtual-env
6 | 


--------------------------------------------------------------------------------
/images/features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/features.png


--------------------------------------------------------------------------------
/images/swagger-ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/swagger-ui.png


--------------------------------------------------------------------------------
/JudgeIt-App/jsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "compilerOptions": {
3 |     "paths": {
4 |       "@/*": ["./*"]
5 |     }
6 |   }
7 | }
8 | 


--------------------------------------------------------------------------------
/images/LLM-Judge-App.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/LLM-Judge-App.png


--------------------------------------------------------------------------------
/images/flow-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/flow-diagram.png


--------------------------------------------------------------------------------
/JudgeIt-App/next.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('next').NextConfig} */
2 | const nextConfig = {}
3 | 
4 | module.exports = nextConfig


--------------------------------------------------------------------------------
/JudgeIt-App/app/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/JudgeIt-App/app/favicon.ico


--------------------------------------------------------------------------------
/REST-Service/deployment/base/redis/kustomization.yaml:
--------------------------------------------------------------------------------
1 | kind: Kustomization
2 | resources:
3 |   - deployment.yaml
4 |   - service.yaml
5 | 


--------------------------------------------------------------------------------
/images/llm-judge-app-saas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/llm-judge-app-saas.png


--------------------------------------------------------------------------------
/images/LLM-Judge_framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/LLM-Judge_framework.png


--------------------------------------------------------------------------------
/images/multiturn-app-batch.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/multiturn-app-batch.gif


--------------------------------------------------------------------------------
/images/multiturn-framework.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/multiturn-framework.gif


--------------------------------------------------------------------------------
/REST-Service/deployment/base/celery-worker/kustomization.yaml:
--------------------------------------------------------------------------------
1 | kind: Kustomization
2 | resources:
3 |   - deployment.yaml
4 |   - service.yaml
5 | 


--------------------------------------------------------------------------------
/images/LLM-Judge-app-onpremise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/LLM-Judge-app-onpremise.png


--------------------------------------------------------------------------------
/images/RAG-reliability-testing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/RAG-reliability-testing.png


--------------------------------------------------------------------------------
/images/multi-turn-evaluation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/multi-turn-evaluation.png


--------------------------------------------------------------------------------
/REST-Service/chart/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: mychart
3 | description: A Helm chart for Kubernetes
4 | version: 0.1.0
5 | appVersion: "1.0"
6 | 


--------------------------------------------------------------------------------
/images/LLM-judge-framework-saas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/LLM-judge-framework-saas.png


--------------------------------------------------------------------------------
/images/rest-service-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/rest-service-architecture.png


--------------------------------------------------------------------------------
/REST-Service/deployment/base/flower/kustomization.yaml:
--------------------------------------------------------------------------------
1 | kind: Kustomization
2 | resources:
3 |   - deployment.yaml
4 |   - service.yaml
5 |   - route.yaml
6 | 


--------------------------------------------------------------------------------
/images/rag-evaluation-reliability.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/rag-evaluation-reliability.png


--------------------------------------------------------------------------------
/REST-Service/deployment/base/rest-app/kustomization.yaml:
--------------------------------------------------------------------------------
1 | kind: Kustomization
2 | resources:
3 |   - deployment.yaml
4 |   - service.yaml
5 |   - route.yaml
6 | 


--------------------------------------------------------------------------------
/images/LLM-Judge-Architecture-Backend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/LLM-Judge-Architecture-Backend.png


--------------------------------------------------------------------------------
/images/llm-judge-framework-onpremise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/llm-judge-framework-onpremise.png


--------------------------------------------------------------------------------
/JudgeIt-App/app/(auth)/signin/page.js:
--------------------------------------------------------------------------------
1 | import SignIn from '@/components/globals/SignIn'
2 | 
3 | export default function SignInPage() {
4 |   return <SignIn />;
5 | }


--------------------------------------------------------------------------------
/images/multi-turn-evaluation-reliability.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/images/multi-turn-evaluation-reliability.png


--------------------------------------------------------------------------------
/Framework/data/input/sample_multi_turn_input.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/Framework/data/input/sample_multi_turn_input.xlsx


--------------------------------------------------------------------------------
/Framework/data/output/sample_multi_turn_output.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/Framework/data/output/sample_multi_turn_output.xlsx


--------------------------------------------------------------------------------
/Framework/data/input/sample_rag_answer_rating_input.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/Framework/data/input/sample_rag_answer_rating_input.xlsx


--------------------------------------------------------------------------------
/Framework/data/output/sample_rag_answer_rating_output.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/Framework/data/output/sample_rag_answer_rating_output.xlsx


--------------------------------------------------------------------------------
/Framework/data/input/sample_rag_answer_similarity_input.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/Framework/data/input/sample_rag_answer_similarity_input.xlsx


--------------------------------------------------------------------------------
/REST-Service/app/src/models/Experiment.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from pydantic import BaseModel, Field
3 | 
4 | class Experiment(BaseModel):
5 |     name: str
6 |     user_id: str
7 |     type: str


--------------------------------------------------------------------------------
/Framework/data/output/sample_rag_answer_similarity_output.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibm-self-serve-assets/JudgeIt-LLM-as-a-Judge/HEAD/Framework/data/output/sample_rag_answer_similarity_output.xlsx


--------------------------------------------------------------------------------
/JudgeIt-App/env:
--------------------------------------------------------------------------------
1 | NEXT_PUBLIC_JUDGE_BACKEND_URL=<http://localhost:3001>
2 | NEXT_PUBLIC_LLM_JUDGE_API_KEY=
3 | OAUTH_ISSUER_URL=
4 | OAUTH_CLIENT_ID=
5 | OAUTH_CLIENT_SECRET=
6 | NEXTAUTH_URL=<http://localhost:3000>
7 | NEXTAUTH_SECRET=
8 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/models/RequestHistory.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | from pydantic import BaseModel, Field
 3 | 
 4 | class RequestHistory(BaseModel):
 5 |     name: str
 6 |     user_id: str
 7 |     experiment_name: str
 8 |     content: Any
 9 |     type: str
10 |     eval_type: str


--------------------------------------------------------------------------------
/REST-Service/app/src/models/LLMInput.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | from pydantic import BaseModel, Field
 3 | 
 4 | class LLMInput(BaseModel):
 5 |     question: str
 6 |     golden_text: str
 7 |     generated_text: str
 8 |     model: str = "meta-llama/llama-3-70b-instruct"
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/models/MultiTurnInput.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from pydantic import BaseModel
3 | 
4 | class MultiTurnInput(BaseModel):
5 |     conversation_history: str
6 |     follow_up_query: str
7 |     golden_query: str
8 |     rewritten_query: str
9 |     model: str = "meta-llama/llama-3-70b-instruct"


--------------------------------------------------------------------------------
/JudgeIt-App/utils/setDynamicRoute.js:
--------------------------------------------------------------------------------
 1 | 'use client';
 2 | 
 3 | import { useEffect } from 'react';
 4 | import { useRouter } from 'next/navigation';
 5 | 
 6 | export function SetDynamicRoute() {
 7 |   const router = useRouter();
 8 | 
 9 |   useEffect(() => {
10 |     router.refresh();
11 |   }, [router]);
12 | 
13 |   return <></>;
14 | }


--------------------------------------------------------------------------------
/JudgeIt-App/utils/sessionProviderWrapper.js:
--------------------------------------------------------------------------------
 1 | 'use client';
 2 | import React from 'react'
 3 | 
 4 | import { SessionProvider } from 'next-auth/react';
 5 | 
 6 | const SessionProviderWrapper = ({children}) => {
 7 |   return (
 8 |     <SessionProvider>{children}</SessionProvider>
 9 |   )
10 | }
11 | 
12 | export default SessionProviderWrapper


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/PageTitle.jsx:
--------------------------------------------------------------------------------
 1 | import {Box, Typography} from '@mui/material'
 2 | 
 3 | const PageTitle = ({ title }) => {
 4 |   return (
 5 |     <Box justifyContent={"center"} display={"flex"} marginBottom={'25px'}>
 6 |       <Typography variant="h6">{title}</Typography>
 7 |     </Box>
 8 |   );
 9 | };
10 | 
11 | export default PageTitle;
12 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/models/SingleTurnInput.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | from pydantic import BaseModel
 3 | 
 4 | class SingleTurnInput(BaseModel):
 5 |     previous_question: str
 6 |     previous_answer: str
 7 |     current_question: str
 8 |     golden_rewritten_question: str
 9 |     rewritten_question: str
10 |     model: str = "meta-llama/llama-3-70b-instruct"


--------------------------------------------------------------------------------
/Framework/config.ini:
--------------------------------------------------------------------------------
 1 | [Default]
 2 | home_dir = path_to_JudgeIt_repository
 3 | model_id = llm_model_id
 4 | input_file_name = input_path_and_filename
 5 | output_file_name = input_path_and_filename
 6 | judge_type = judge_type
 7 | 
 8 | [WML_CRED]
 9 | wml_platform = saas
10 | wml_user = watsonx.user
11 | wml_url = watsonx.ai_url
12 | api_key = ibm_cloud_api_key
13 | project_id = watsonx.ai_project_id


--------------------------------------------------------------------------------
/REST-Service/deployment/base/redis/service.yaml:
--------------------------------------------------------------------------------
 1 | kind: Service
 2 | apiVersion: v1
 3 | metadata:
 4 |   name: redis
 5 |   labels:
 6 |     app: redis
 7 | spec:
 8 |   ports:
 9 |     - name: 6379-tcp
10 |       protocol: TCP
11 |       port: 6379
12 |       targetPort: 6379
13 |   internalTrafficPolicy: Cluster
14 |   type: ClusterIP
15 |   selector:
16 |     app: redis
17 |     deployment: redis
18 | 
19 | 


--------------------------------------------------------------------------------
/REST-Service/deployment/base/flower/route.yaml:
--------------------------------------------------------------------------------
 1 | kind: Route
 2 | apiVersion: route.openshift.io/v1
 3 | metadata:
 4 |   name: flower-app
 5 |   labels:
 6 |     app: flower-app
 7 | spec:
 8 |   to:
 9 |     kind: Service
10 |     name: flower-app
11 |     weight: 100
12 |   port:
13 |     targetPort: 5555-tcp
14 |   tls:
15 |     termination: edge
16 |     insecureEdgeTerminationPolicy: Redirect
17 |   wildcardPolicy: None
18 |   


--------------------------------------------------------------------------------
/REST-Service/deployment/base/rest-app/route.yaml:
--------------------------------------------------------------------------------
 1 | kind: Route
 2 | apiVersion: route.openshift.io/v1
 3 | metadata:
 4 |   name: llm-judge-backend
 5 |   labels:
 6 |     app: llm-judge-backend
 7 | spec:
 8 |   to:
 9 |     kind: Service
10 |     name: llm-judge-backend
11 |     weight: 100
12 |   port:
13 |     targetPort: 3001-tcp
14 |   tls:
15 |     termination: edge
16 |     insecureEdgeTerminationPolicy: Redirect
17 |   wildcardPolicy: None
18 | 


--------------------------------------------------------------------------------
/REST-Service/deployment/base/celery-worker/service.yaml:
--------------------------------------------------------------------------------
 1 | kind: Service
 2 | apiVersion: v1
 3 | metadata:
 4 |   name: celery-worker
 5 |   labels:
 6 |     app: celery-worker
 7 | spec:
 8 |   ports:
 9 |     - name: 3001-tcp
10 |       protocol: TCP
11 |       port: 3001
12 |       targetPort: 3001
13 |     - name: 8080-tcp
14 |       protocol: TCP
15 |       port: 8080
16 |       targetPort: 8080
17 |   selector:
18 |     app: celery-worker
19 |     deployment: celery-worker


--------------------------------------------------------------------------------
/REST-Service/deployment/base/flower/service.yaml:
--------------------------------------------------------------------------------
 1 | kind: Service
 2 | apiVersion: v1
 3 | metadata:
 4 |   name: flower-app
 5 |   labels:
 6 |     app: flower-app 
 7 | spec:
 8 |   ports:
 9 |     - name: 3001-tcp
10 |       protocol: TCP
11 |       port: 3001
12 |       targetPort: 3001
13 |     - name: 5555-tcp
14 |       protocol: TCP
15 |       port: 5555
16 |       targetPort: 5555
17 |   type: ClusterIP
18 |   selector:
19 |     app: flower-app
20 |     deployment: flower-app
21 | 


--------------------------------------------------------------------------------
/JudgeIt-App/deployment/readme.md:
--------------------------------------------------------------------------------
 1 | # LLM-Judge frontend
 2 | 
 3 | Make sure you deploy the Backend REST service first.
 4 | 
 5 | ```yaml
 6 | oc create secret generic llmjudge-frontend-secret \
 7 | --from-literal=JUDGE_BACKEND_URL='' \
 8 | --from-literal=LLM_JUDGE_API_KEY='' \
 9 | --from-literal=NEXTAUTH_SECRET='' \
10 | --from-literal=NEXTAUTH_URL='' \
11 | --from-literal=OAUTH_CLIENT_ID='' \
12 | --from-literal=OAUTH_ISSUER_URL=''
13 | ```
14 | 
15 | ```sh
16 | oc apply -f deployment.yaml
17 | ```


--------------------------------------------------------------------------------
/REST-Service/deployment/base/rest-app/service.yaml:
--------------------------------------------------------------------------------
 1 | kind: Service
 2 | apiVersion: v1
 3 | metadata:
 4 |   name: llm-judge-backend
 5 |   labels:
 6 |     app: llm-judge-backend
 7 | spec:
 8 |   ports:
 9 |     - name: 3001-tcp
10 |       protocol: TCP
11 |       port: 3001
12 |       targetPort: 3001
13 |     - name: 8080-tcp
14 |       protocol: TCP
15 |       port: 8080
16 |       targetPort: 8080
17 |   type: ClusterIP
18 |   selector:
19 |     app: llm-judge-backend
20 |     deployment: llm-judge-backend
21 | 


--------------------------------------------------------------------------------
/Framework/sample_config.ini:
--------------------------------------------------------------------------------
 1 | [Default]
 2 | home_dir = /<home_directory>/JudgeIt-LLM-as-a-Judge/
 3 | model_id = meta-llama/llama-3-1-70b-instruct
 4 | input_file_name = Framework/data/input/sample_rag_answer_similarity_input.xlsx
 5 | output_file_name = Framework/data/output/sample_rag_answer_similarity_output.xlsx
 6 | judge_type = rag_eval_answer_similarity
 7 | 
 8 | [WML_CRED]
 9 | wml_platform = saas
10 | wml_user = ''
11 | wml_url = https://us-south.ml.cloud.ibm.com
12 | api_key = ibm_cloud_api_key
13 | project_id = watsonx.ai_project_id


--------------------------------------------------------------------------------
/JudgeIt-App/utils/encryption.js:
--------------------------------------------------------------------------------
 1 | import Cryptr from "cryptr";
 2 | 
 3 | export function encrypt(text) {
 4 |   const secretKey = process.env.NEXTAUTH_SECRET;
 5 |   const cryptr = new Cryptr(secretKey);
 6 | 
 7 |   const encryptedString = cryptr.encrypt(text);
 8 |   return encryptedString; 
 9 | }
10 | 
11 | export function decrypt(encryptedString) {
12 |     const secretKey = process.env.NEXTAUTH_SECRET;
13 |     const cryptr = new Cryptr(secretKey);
14 |   
15 |     const text = cryptr.decrypt(encryptedString);
16 |     return text;
17 |   }


--------------------------------------------------------------------------------
/JudgeIt-App/.gitignore:
--------------------------------------------------------------------------------
 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
 2 | 
 3 | # dependencies
 4 | /node_modules
 5 | /.pnp
 6 | .pnp.js
 7 | .yarn/install-state.gz
 8 | 
 9 | # testing
10 | /coverage
11 | 
12 | # next.js
13 | /.next/
14 | /out/
15 | 
16 | # production
17 | /build
18 | 
19 | # misc
20 | .DS_Store
21 | *.pem
22 | 
23 | # debug
24 | npm-debug.log*
25 | yarn-debug.log*
26 | yarn-error.log*
27 | 
28 | # local env files
29 | .env*.local
30 | 
31 | # vercel
32 | .vercel
33 | 
34 | # typescript
35 | *.tsbuildinfo
36 | next-env.d.ts
37 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/DataGridToolbar.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import {
 3 |   GridToolbarContainer,
 4 |   GridToolbarColumnsButton,
 5 |   GridToolbarFilterButton,
 6 |   GridToolbarDensitySelector,
 7 |   GridToolbarExport
 8 | } from "@mui/x-data-grid";
 9 | 
10 | const DataGridToolbar = () => {
11 | 
12 |   return (
13 |     <GridToolbarContainer>
14 |       <GridToolbarColumnsButton />
15 |       <GridToolbarFilterButton />
16 |       <GridToolbarDensitySelector />
17 |       <GridToolbarExport />
18 |     </GridToolbarContainer>
19 |   );
20 | };
21 | export default DataGridToolbar;


--------------------------------------------------------------------------------
/REST-Service/chart/values.yaml:
--------------------------------------------------------------------------------
 1 | replicaCount: 1
 2 | 
 3 | image:
 4 |   repository: <your-docker-image>
 5 |   tag: "latest"
 6 |   pullPolicy: IfNotPresent
 7 | 
 8 | service:
 9 |   fastapi:
10 |     type: ClusterIP
11 |     port: 3001
12 |   redis:
13 |     type: ClusterIP
14 |     port: 6379
15 |   flower:
16 |     type: ClusterIP
17 |     port: 5555
18 | 
19 | env:
20 |   WATSONX_URL: "https://us-south.ml.cloud.ibm.com"
21 |   WX_PROJECT_ID: ""
22 |   IBM_CLOUD_API_KEY: ""
23 |   CELERY_BROKER_URL: "redis://redis:6379/0"
24 |   CELERY_RESULT_BACKEND: "redis://redis:6379/0"
25 | 
26 | resources: {}
27 | 


--------------------------------------------------------------------------------
/JudgeIt-App/app/api/auth/[...nextauth]/route.js:
--------------------------------------------------------------------------------
 1 | import NextAuth from "next-auth";
 2 | import Auth0Provider from "next-auth/providers/auth0";
 3 | 
 4 | export const authOptions = {
 5 |   providers: [
 6 |     Auth0Provider({
 7 |         issuer: `${process.env.OAUTH_ISSUER_URL}`,
 8 |         clientId:`${process.env.OAUTH_CLIENT_ID}`,
 9 |         clientSecret: `${process.env.OAUTH_CLIENT_SECRET}`,
10 |         id: 'IBMid',
11 |         name: 'IBMid',
12 |       }),
13 |     ],
14 |     pages: {
15 |       signIn: "/signin"
16 |     }
17 |   }
18 |   
19 | const handler =  NextAuth(authOptions);
20 | 
21 | export { handler as GET, handler as POST };


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/EvaluationTypeLabel.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import { Tooltip } from "@mui/material";
 3 | import InfoOutlinedIcon from "@mui/icons-material/InfoOutlined";
 4 | 
 5 | const EvaluationTypeLabel = ({ label, tooltip }) => {
 6 |   return (
 7 |     <div style={{ display: "flex", flexDirection: "row" }}>
 8 |       <span>{label}</span>
 9 |       <Tooltip
10 |         title={tooltip}
11 |         sx={{
12 |           cursor: "help",
13 |           marginLeft: "5px",
14 |         }}
15 |       >
16 |         <InfoOutlinedIcon />
17 |       </Tooltip>
18 |     </div>
19 |   );
20 | };
21 | 
22 | export default EvaluationTypeLabel;
23 | 


--------------------------------------------------------------------------------
/REST-Service/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Use the official Selenium Standalone Chrome image as the base image
 2 | FROM registry.access.redhat.com/ubi8/python-311:latest
 3 | 
 4 | # Set the working directory inside the container
 5 | WORKDIR /app/backend
 6 | 
 7 | # Copy the requirements file to the container and install dependencies
 8 | COPY requirements.txt requirements.txt
 9 | RUN pip3 install -r requirements.txt
10 | RUN pip3 install bson
11 | RUN pip3 install pymongo
12 | 
13 | # Copy your FastAPI Python script to the container
14 | COPY main.py main.py
15 | COPY app/ app/
16 | COPY cert/ cert/
17 | 
18 | EXPOSE 3001
19 | 
20 | # Set the command to run your Python script
21 | CMD ["python3", "main.py"]


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/LinearProgressWithLabel.jsx:
--------------------------------------------------------------------------------
 1 | import LinearProgress from '@mui/material/LinearProgress';
 2 | import Typography from '@mui/material/Typography';
 3 | import Box from '@mui/material/Box';
 4 | 
 5 | export default function LinearProgressWithLabel({ value, width }) {
 6 |   return (
 7 |     <Box sx={{ display: 'flex', alignItems: 'center' }}>
 8 |       <Box sx={{ width: width, mr: 1 }}>
 9 |         <LinearProgress variant="determinate" value={value} />
10 |       </Box>
11 |       <Box sx={{ minWidth: 35 }}>
12 |         <Typography variant="body2" color="text.secondary">{`${Math.round(
13 |           value,
14 |         )}%`}</Typography>
15 |       </Box>
16 |     </Box>
17 |   );
18 | }


--------------------------------------------------------------------------------
/JudgeIt-App/utils/sessionTokenAccessor.js:
--------------------------------------------------------------------------------
 1 | import { getServerSession } from "next-auth";
 2 | import { authOptions } from "../app/api/auth/[...nextauth]/route";
 3 | import { decrypt } from "./encryption";
 4 | 
 5 | export async function getAccessToken() {
 6 | 
 7 |   const session = await getServerSession(authOptions);  
 8 |   if(session){    
 9 |     const accessTokenDecrypted = decrypt(session.access_token)    
10 |     return accessTokenDecrypted;
11 |   }
12 |   return null;
13 | }
14 | 
15 | export async function getIdToken() {
16 | 
17 |   const session = await getServerSession(authOptions);  
18 |   if(session){    
19 |     const idTokenDecrypted = decrypt(session.id_token)    
20 |     return idTokenDecrypted;
21 |   }
22 |   return null;
23 | }


--------------------------------------------------------------------------------
/REST-Service/app/route/root/routes.py:
--------------------------------------------------------------------------------
 1 | from fastapi import APIRouter, requests as request
 2 | from fastapi.responses import HTMLResponse
 3 | 
 4 | root_api_route = APIRouter()
 5 | 
 6 | API_PREFIX = "/"
 7 | ## This routes returns the text to SQL from a given context and a sql query
 8 | @root_api_route.get(API_PREFIX)
 9 | def root_api():
10 |     return HTMLResponse(
11 |         """
12 |         <html>
13 |                 <head>
14 |                     <title>LLM Judge service</title>
15 |                 </head>
16 |                 <body>
17 |                     <h1>LLM Judge service!</h1>
18 |                     <h3>For complete API visit open API <a href="/docs">docs</a></h3>
19 |                 </body>
20 |         </html>
21 |         """
22 |     )


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | certifi==2024.07.04
 2 | chardet==5.2.0
 3 | charset-normalizer==3.3.2
 4 | click==8.1.7
 5 | ibm-cos-sdk==2.13.5
 6 | ibm-cos-sdk-core==2.13.5
 7 | ibm-cos-sdk-s3transfer==2.13.5
 8 | ibm_watsonx_ai==1.0.10
 9 | idna==3.7
10 | importlib_metadata==8.0.0
11 | jmespath==1.0.1
12 | joblib==1.4.2
13 | langchain-ibm==0.1.12
14 | lomond==0.3.3
15 | nltk==3.8.1
16 | numpy==1.26.4
17 | openpyxl==3.1.5
18 | packaging==24.1
19 | pandas==2.1.4
20 | python-dateutil==2.9.0.post0
21 | pytz==2024.1
22 | regex==2024.5.15
23 | requests==2.32.3
24 | rouge==1.0.1
25 | scikit-learn==1.5.0
26 | scipy==1.14.0
27 | six==1.16.0
28 | tabulate==0.9.0
29 | threadpoolctl==3.5.0
30 | tqdm==4.66.4
31 | tzdata==2024.1
32 | urllib3==2.1.0
33 | XlsxWriter==3.2.0
34 | zipp==3.19.2
35 | 


--------------------------------------------------------------------------------
/REST-Service/deployment/base/kustomization.yaml:
--------------------------------------------------------------------------------
 1 | kind: Kustomization
 2 | images:
 3 |   - name: backend-image-name
 4 |     newName: image-registry.openshift-image-registry.svc:5000/llm-judge-dev/backend
 5 |     newTag: v1.0
 6 | secretGenerator:
 7 | - name: llm-judge-secret
 8 |   literals:
 9 |     - WATSONX_URL=
10 |     - WX_PROJECT_ID=
11 |     - IBM_CLOUD_API_KEY=
12 |     - LLM_JUDGE_API_KEY=JudgeIt-Secret-Api-Key
13 |     - WX_PLATFORM=saas
14 |     - WX_USER=
15 |     - CELERY_BROKER_URL=redis://redis:6379/0
16 |     - CELERY_RESULT_BACKEND=redis://redis:6379/0
17 |     - SERVER_URL=
18 |     - MONGO_URL=
19 |     - MONGO_USER=
20 |     - MONGO_PASS=
21 |     - MONGO_DB="judgeit_app"
22 | resources:
23 |   - redis/
24 |   - celery-worker/
25 |   - flower/
26 |   - rest-app/


--------------------------------------------------------------------------------
/JudgeIt-App/app/api/auth/logout/route.js:
--------------------------------------------------------------------------------
 1 | import { authOptions } from "../[...nextauth]/route";
 2 | import { getServerSession } from "next-auth"
 3 | import { getIdToken } from "@/utils/sessionTokenAccessor";
 4 | 
 5 | export async function GET() {
 6 |   const session = await getServerSession(authOptions);
 7 | 
 8 |   if (session) {
 9 | 
10 |     const idToken = await getIdToken();
11 | 
12 |     // this will log out the user on Keycloak side
13 |     var url = `${process.env.END_SESSION_URL}?id_token_hint=${idToken}&post_logout_redirect_uri=${encodeURIComponent(process.env.NEXTAUTH_URL)}`;
14 | 
15 |     try {
16 |       const resp = await fetch(url, { method: "GET" });
17 |     } catch (err) {
18 |       console.error(err);
19 |       return new Response({ status: 500 });
20 |     }
21 |   }
22 |   return new Response({ status: 200 });
23 | }


--------------------------------------------------------------------------------
/JudgeIt-App/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM node:22.4.1-alpine AS deps
 2 | #RUN apk add --no-cache libc6-compat=1.2.4-r2
 3 | WORKDIR /app
 4 | 
 5 | COPY package.json ./
 6 | COPY package-lock.json ./
 7 | RUN  npm install
 8 | 
 9 | FROM node:22.4.1-alpine AS builder
10 | WORKDIR /app
11 | COPY --from=deps /app/node_modules ./node_modules
12 | COPY . .
13 | 
14 | RUN npm run build
15 | 
16 | FROM node:22.4.1-alpine AS runner
17 | WORKDIR /app
18 | 
19 | ENV NODE_ENV production
20 | ENV NEXT_TELEMETRY_DISABLED 1
21 | 
22 | RUN addgroup --system --gid 1001 nodejs
23 | RUN adduser --system --uid 1001 nextjs
24 | 
25 | COPY --from=builder --chown=nextjs:nodejs /app/.next ./.next
26 | COPY --from=builder /app/node_modules ./node_modules
27 | COPY --from=builder /app/package.json ./package.json
28 |  
29 | USER nextjs
30 | 
31 | EXPOSE 3000
32 | 
33 | ENV PORT 3000
34 | 
35 | CMD ["npm", "start"]


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/Footer.jsx:
--------------------------------------------------------------------------------
 1 | function Footer() {
 2 |   return (
 3 |     <footer
 4 |       style={{
 5 |         bottom: 15,
 6 |         height: "40px",
 7 |         fontSize: 11,
 8 |         zIndex: 1,
 9 |         textAlign: "center",
10 |       }}
11 |     >
12 |       <div className="container">
13 |         <p style={{ margin: "20px" }}>
14 |           Disclaimer - Please note that this content is made available to foster
15 |           Embedded AI technology adoption. The content may include systems &
16 |           methods pending patent with the USPTO and protected under US Patent
17 |           Laws. Copyright - 2024 IBM Corporation. In case of any questions or
18 |           support, please reach out to{" "}
19 |           <a href="mailto: kunal@ibm.com">kunal@ibm.com</a>
20 |         </p>
21 |       </div>
22 |     </footer>
23 |   );
24 | }
25 | 
26 | export default Footer;
27 | 


--------------------------------------------------------------------------------
/JudgeIt-App/deployment/deployment.yaml:
--------------------------------------------------------------------------------
 1 | kind: Deployment
 2 | apiVersion: apps/v1
 3 | metadata:
 4 |   resourceVersion: '108957306'
 5 |   name: llm-judge-frontend
 6 |   labels:
 7 |     app: llm-judge-frontend
 8 | spec:
 9 |   replicas: 1
10 |   selector:
11 |     matchLabels:
12 |       app: llm-judge-frontend
13 |   template:
14 |     metadata:
15 |       labels:
16 |         app: llm-judge-frontend
17 |         deployment: llm-judge-frontend
18 |       annotations:
19 |         openshift.io/generated-by: OpenShiftWebConsole
20 |     spec:
21 |       containers:
22 |         - name: llm-judge-frontend
23 |           image: 'image-registry.openshift-image-registry.svc:5000/llm-judge/llm-judge-frontend@sha256:5ac9b1aa09123b4d09a7e0f297e542c895350f7a700779b36df77b0897f45f46'
24 |           ports:
25 |             - containerPort: 3000
26 |               protocol: TCP
27 |           envFrom:
28 |             - secretRef:
29 |                 name: llmjudge-frontend-secret
30 |           resources: {}
31 | 


--------------------------------------------------------------------------------
/REST-Service/deployment/base/redis/deployment.yaml:
--------------------------------------------------------------------------------
 1 | kind: Deployment
 2 | apiVersion: apps/v1
 3 | metadata:
 4 |   name: redis
 5 |   labels:
 6 |     app: redis
 7 | spec:
 8 |   replicas: 1
 9 |   selector:
10 |     matchLabels:
11 |       app: redis
12 |   template:
13 |     metadata:
14 |       labels:
15 |         app: redis
16 |         deployment: redis
17 |       annotations:
18 |         openshift.io/generated-by: OpenShiftWebConsole
19 |     spec:
20 |       volumes:
21 |         - name: redis-1
22 |           emptyDir: {}
23 |       containers:
24 |         - name: redis
25 |           image: redis:7.2.5-alpine
26 |           ports:
27 |             - containerPort: 6379
28 |               protocol: TCP
29 |           resources: {}
30 |           volumeMounts:
31 |             - name: redis-1
32 |               mountPath: /data
33 |           terminationMessagePath: /dev/termination-log
34 |           terminationMessagePolicy: File
35 |           imagePullPolicy: IfNotPresent
36 |       restartPolicy: Always
37 | 


--------------------------------------------------------------------------------
/JudgeIt-App/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "judge-app",
 3 |   "version": "0.1.0",
 4 |   "private": true,
 5 |   "scripts": {
 6 |     "dev": "next dev",
 7 |     "build": "next build",
 8 |     "start": "next start",
 9 |     "lint": "next lint"
10 |   },
11 |   "dependencies": {
12 |     "@emotion/react": "^11.11.4",
13 |     "@emotion/styled": "^11.11.5",
14 |     "@mui/icons-material": "^5.16.0",
15 |     "@mui/material": "^5.16.0",
16 |     "@mui/x-data-grid": "^7.16.0",
17 |     "axios": "^1.7.2",
18 |     "chart.js": "^4.4.4",
19 |     "chartjs-plugin-datalabels": "^2.2.0",
20 |     "cryptr": "^6.3.0",
21 |     "formik": "^2.4.6",
22 |     "next": "14.2.5",
23 |     "next-auth": "^4.24.7",
24 |     "react": "^18",
25 |     "react-chartjs-2": "^5.2.0",
26 |     "react-dom": "^18",
27 |     "react-dropzone": "^14.2.3",
28 |     "react-pro-sidebar": "^1.1.0",
29 |     "uuid": "^10.0.0",
30 |     "yup": "^1.4.0"
31 |   },
32 |   "devDependencies": {
33 |     "eslint": "^8",
34 |     "eslint-config-next": "14.2.5"
35 |   }
36 | }
37 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/config/TimeoutMiddleware.py:
--------------------------------------------------------------------------------
 1 | from fastapi.responses import JSONResponse
 2 | from starlette.middleware.base import BaseHTTPMiddleware
 3 | from fastapi import FastAPI, Request, HTTPException
 4 | import time
 5 | 
 6 | class TimeoutMiddleware(BaseHTTPMiddleware):
 7 |     def __init__(self, app, timeout: int):
 8 |         super().__init__(app)
 9 |         self.timeout = timeout
10 | 
11 |     async def dispatch(self, request: Request, call_next):
12 |         start_time = time.time()
13 |         try:
14 |             response = await call_next(request)
15 |             process_time = time.time() - start_time
16 |             if process_time > self.timeout:
17 |                 raise HTTPException(status_code=408, detail="Request Timeout")
18 |             return response
19 |         except Exception as e:
20 |             process_time = time.time() - start_time
21 |             if process_time > self.timeout:
22 |                 return JSONResponse(content={"detail": "Request Timeout"}, status_code=408)
23 |             raise e


--------------------------------------------------------------------------------
/JudgeIt-App/app/layout.js:
--------------------------------------------------------------------------------
 1 | import "../styles/globals.css";
 2 | import Footer from "@/components/globals/Footer";
 3 | import Topbar from "@/components/globals/Topbar";
 4 | import { Grid, Box, AppBar } from "@mui/material";
 5 | import SessionProviderWrapper from "@/utils/sessionProviderWrapper";
 6 | 
 7 | export const metadata = {
 8 |   title: "LLM Judge Application",
 9 |   description: "LLM Judge Application to evaluate LLM response.",
10 | };
11 | 
12 | export default function RootLayout({ children }) {
13 |   return (
14 |     <SessionProviderWrapper>
15 |       <html lang="en" suppressHydrationWarning={true}>
16 |         <body suppressHydrationWarning={true}>
17 |           <div style={{ minHeight: "90vh" }}>
18 |             <Topbar />
19 |             <Grid sx={{ flexGrow: 1 }} container spacing={0}>
20 |               <Grid item xs={12}>
21 |                 <Box>{children}</Box>
22 |               </Grid>
23 |             </Grid>
24 |           </div>
25 |         </body>
26 |       </html>
27 |     </SessionProviderWrapper>
28 |   );
29 | }
30 | 


--------------------------------------------------------------------------------
/REST-Service/chart/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: fastapi-app
 5 |   labels:
 6 |     app: fastapi-app
 7 | spec:
 8 |   type: {{ .Values.service.fastapi.type }}
 9 |   ports:
10 |     - port: {{ .Values.service.fastapi.port }}
11 |       targetPort: {{ .Values.service.fastapi.port }}
12 |   selector:
13 |     app: fastapi-app
14 | 
15 | ---
16 | 
17 | apiVersion: v1
18 | kind: Service
19 | metadata:
20 |   name: redis
21 |   labels:
22 |     app: redis
23 | spec:
24 |   type: {{ .Values.service.redis.type }}
25 |   ports:
26 |     - port: {{ .Values.service.redis.port }}
27 |       targetPort: {{ .Values.service.redis.port }}
28 |   selector:
29 |     app: redis
30 | 
31 | ---
32 | 
33 | apiVersion: v1
34 | kind: Service
35 | metadata:
36 |   name: flower
37 |   labels:
38 |     app: flower
39 | spec:
40 |   type: {{ .Values.service.flower.type }}
41 |   ports:
42 |     - port: {{ .Values.service.flower.port }}
43 |       targetPort: {{ .Values.service.flower.port }}
44 |   selector:
45 |     app: flower
46 | 


--------------------------------------------------------------------------------
/REST-Service/requirements.txt:
--------------------------------------------------------------------------------
 1 | fastapi
 2 | uvicorn
 3 | certifi==2024.6.2
 4 | charset-normalizer==3.3.2
 5 | click==8.1.7
 6 | ibm-cos-sdk==2.13.5
 7 | ibm-cos-sdk-core==2.13.5
 8 | ibm-cos-sdk-s3transfer==2.13.5
 9 | ibm_watson_machine_learning==1.0.359
10 | ibm_watsonx_ai==1.0.10
11 | idna==3.7
12 | importlib_metadata==8.0.0
13 | jmespath==1.0.1
14 | joblib==1.4.2
15 | lomond==0.3.3
16 | nltk==3.8.1
17 | numpy==1.26.4
18 | packaging==24.1
19 | pandas==2.1.4
20 | python-dateutil==2.9.0.post0
21 | pytz==2024.1
22 | regex==2024.5.15
23 | requests==2.32.4
24 | rouge==1.0.1
25 | scikit-learn==1.5.0
26 | scipy==1.14.0
27 | six==1.16.0
28 | tabulate==0.9.0
29 | threadpoolctl==3.5.0
30 | tqdm==4.66.4
31 | tzdata==2024.1
32 | urllib3==2.1.0
33 | zipp==3.19.2
34 | openpyxl==3.1.5
35 | langchain-ibm==0.1.10
36 | celery==5.4.0
37 | redis==5.0.7
38 | flower==2.0.1
39 | asyncio==3.4.3
40 | python-dotenv
41 | python-multipart
42 | fuzzywuzzy==0.18.0
43 | python-Levenshtein==0.27.1
44 | ibm-watsonx-gov==1.2.2
45 | Jinja2==3.1.2
46 | jsonschema==4.25.1
47 | unitxt==1.26.6
48 | textstat==0.7.10


--------------------------------------------------------------------------------
/JudgeIt-App/styles/globals.css:
--------------------------------------------------------------------------------
 1 | @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500&family=Source+Sans+Pro:ital,wght@0,400;0,600;1,600&display=swap');
 2 | 
 3 | html,
 4 | body,
 5 | #root,
 6 | .app,
 7 | .content {
 8 |   margin: 0;
 9 |   height: 100%;
10 |   width: 100%;
11 |   font-family: 'IBM Plex Sans';
12 |   overflow: hidden;
13 | }
14 | 
15 | .app {
16 |   display: flex;
17 |   position: relative;
18 | }
19 | 
20 | ::-webkit-scrollbar {
21 |   width: 10px;
22 | }
23 | 
24 | /* Track */
25 | 
26 | ::-webkit-scrollbar-track {
27 |   background: #e0e0e0;
28 | }
29 | 
30 | /* handle */
31 | 
32 | ::-webkit-scrollbar-thumb {
33 |   background: #888;
34 | }
35 | 
36 | /* handle on Hover */
37 | 
38 | ::-webkit-scrollbar-track:hover {
39 |   background: #555;
40 | }
41 | 
42 | .drag-and-drop {
43 |   width: 100%;
44 |   height: 200px;
45 |   border: 2px dashed #ccc;
46 |   border-radius: 5px;
47 |   display: flex;
48 |   justify-content: center;
49 |   align-items: center;
50 |   cursor: pointer;
51 | }
52 | 
53 | .dragging {
54 |   background-color: #f1f1f1;
55 | }


--------------------------------------------------------------------------------
/REST-Service/deployment/base/flower/deployment.yaml:
--------------------------------------------------------------------------------
 1 | kind: Deployment
 2 | apiVersion: apps/v1
 3 | metadata:
 4 |   name: flower-app
 5 |   labels:
 6 |     app: flower-app
 7 | spec:
 8 |   replicas: 1
 9 |   selector:
10 |     matchLabels:
11 |       app: flower-app
12 |   template:
13 |     metadata:
14 |       labels:
15 |         app: flower-app
16 |         deployment: flower-app
17 |     spec:
18 |       containers:
19 |         - resources: {}
20 |           terminationMessagePath: /dev/termination-log
21 |           name: flower-app
22 |           command:
23 |             - celery
24 |             - '--broker=redis://redis:6379/0'
25 |             - flower
26 |             - '--port=5555'
27 |           ports:
28 |             - containerPort: 5555
29 |               protocol: TCP
30 |             - containerPort: 8080
31 |               protocol: TCP
32 |           imagePullPolicy: IfNotPresent
33 |           terminationMessagePolicy: File
34 |           envFrom:
35 |             - secretRef:
36 |                 name: llm-judge-secret
37 |           image: backend-image-name:latest
38 |       


--------------------------------------------------------------------------------
/REST-Service/cert/mongo.crt:
--------------------------------------------------------------------------------
 1 | -----BEGIN CERTIFICATE-----
 2 | MIIDDzCCAfegAwIBAgIJANEH58y2/kzHMA0GCSqGSIb3DQEBCwUAMB4xHDAaBgNV
 3 | BAMME0lCTSBDbG91ZCBEYXRhYmFzZXMwHhcNMTgwNjI1MTQyOTAwWhcNMjgwNjIy
 4 | MTQyOTAwWjAeMRwwGgYDVQQDDBNJQk0gQ2xvdWQgRGF0YWJhc2VzMIIBIjANBgkq
 5 | hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA8lpaQGzcFdGqeMlmqjffMPpIQhqpd8qJ
 6 | Pr3bIkrXJbTcJJ9uIckSUcCjw4Z/rSg8nnT13SCcOl+1to+7kdMiU8qOWKiceYZ5
 7 | y+yZYfCkGaiZVfazQBm45zBtFWv+AB/8hfCTdNF7VY4spaA3oBE2aS7OANNSRZSK
 8 | pwy24IUgUcILJW+mcvW80Vx+GXRfD9Ytt6PRJgBhYuUBpgzvngmCMGBn+l2KNiSf
 9 | weovYDCD6Vngl2+6W9QFAFtWXWgF3iDQD5nl/n4mripMSX6UG/n6657u7TDdgkvA
10 | 1eKI2FLzYKpoKBe5rcnrM7nHgNc/nCdEs5JecHb1dHv1QfPm6pzIxwIDAQABo1Aw
11 | TjAdBgNVHQ4EFgQUK3+XZo1wyKs+DEoYXbHruwSpXjgwHwYDVR0jBBgwFoAUK3+X
12 | Zo1wyKs+DEoYXbHruwSpXjgwDAYDVR0TBAUwAwEB/zANBgkqhkiG9w0BAQsFAAOC
13 | AQEAJf5dvlzUpqaix26qJEuqFG0IP57QQI5TCRJ6Xt/supRHo63eDvKw8zR7tlWQ
14 | lV5P0N2xwuSl9ZqAJt7/k/3ZeB+nYwPoyO3KvKvATunRvlPBn4FWVXeaPsG+7fhS
15 | qsejmkyonYw77HRzGOzJH4Zg8UN6mfpbaWSsyaExvqknCp9SoTQP3D67AzWqb1zY
16 | doqqgGIZ2nxCkp5/FXxF/TMb55vteTQwfgBy60jVVkbF7eVOWCv0KaNHPF5hrqbN
17 | i+3XjJ7/peF3xMvTMoy35DcT3E2ZeSVjouZs15O90kI3k2daS2OHJABW0vSj4nLz
18 | +PQzp/B9cQmOO8dCe049Q3oaUA==
19 | -----END CERTIFICATE-----
20 | 
21 | 


--------------------------------------------------------------------------------
/REST-Service/deployment/base/celery-worker/deployment.yaml:
--------------------------------------------------------------------------------
 1 | kind: Deployment
 2 | apiVersion: apps/v1
 3 | metadata:
 4 |   name: celery-worker
 5 |   labels:
 6 |     app: celery-worker
 7 | spec:
 8 |   replicas: 1
 9 |   selector:
10 |     matchLabels:
11 |       app: celery-worker
12 |   template:
13 |     metadata:
14 |       labels:
15 |         app: celery-worker
16 |         deployment: celery-worker
17 |       annotations:
18 |         openshift.io/generated-by: OpenShiftWebConsole
19 |     spec:
20 |       containers:
21 |         - resources: {}
22 |           terminationMessagePath: /dev/termination-log
23 |           name: celery-worker
24 |           command:
25 |             - celery
26 |             - '-A'
27 |             - app.celery.celery_worker.celery
28 |             - worker
29 |             - '--loglevel=info'
30 |           ports:
31 |             - containerPort: 3001
32 |               protocol: TCP
33 |             - containerPort: 8080
34 |               protocol: TCP
35 |           imagePullPolicy: IfNotPresent
36 |           terminationMessagePolicy: File
37 |           envFrom:
38 |             - secretRef:
39 |                 name: llm-judge-secret
40 |           image: backend-image-name:latest
41 |      


--------------------------------------------------------------------------------
/REST-Service/deployment/base/rest-app/deployment.yaml:
--------------------------------------------------------------------------------
 1 | kind: Deployment
 2 | apiVersion: apps/v1
 3 | metadata:
 4 |   name: llm-judge-backend
 5 |   labels:
 6 |     app: llm-judge-backend
 7 | spec:
 8 |   replicas: 1
 9 |   selector:
10 |     matchLabels:
11 |       app: llm-judge-backend
12 |   template:
13 |     metadata:
14 |       labels:
15 |         app: llm-judge-backend
16 |         deployment: llm-judge-backend
17 |     spec:
18 |       containers:
19 |         - resources: {}
20 |           terminationMessagePath: /dev/termination-log
21 |           name: llm-judge-backend
22 |           ports:
23 |             - containerPort: 3001
24 |               protocol: TCP
25 |             - containerPort: 8080
26 |               protocol: TCP
27 |           imagePullPolicy: IfNotPresent
28 |           envFrom:
29 |             - secretRef:
30 |                 name: llm-judge-secret
31 |           image: backend-image-name:latest
32 |           volumeMounts:
33 |             - name: mongodb-cert-volume
34 |               readOnly: true
35 |               mountPath: /app/backend/cert
36 |       volumes:
37 |         - name: mongodb-cert-volume
38 |           secret:
39 |             secretName: mongodb-cert-secret
40 |             defaultMode: 420


--------------------------------------------------------------------------------
/JudgeIt-App/public/next.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 394 80"><path fill="#000" d="M262 0h68.5v12.7h-27.2v66.6h-13.6V12.7H262V0ZM149 0v12.7H94v20.4h44.3v12.6H94v21h55v12.6H80.5V0h68.7zm34.3 0h-17.8l63.8 79.4h17.9l-32-39.7 32-39.6h-17.9l-23 28.6-23-28.6zm18.3 56.7-9-11-27.1 33.7h17.8l18.3-22.7z"/><path fill="#000" d="M81 79.3 17 0H0v79.3h13.6V17l50.2 62.3H81Zm252.6-.4c-1 0-1.8-.4-2.5-1s-1.1-1.6-1.1-2.6.3-1.8 1-2.5 1.6-1 2.6-1 1.8.3 2.5 1a3.4 3.4 0 0 1 .6 4.3 3.7 3.7 0 0 1-3 1.8zm23.2-33.5h6v23.3c0 2.1-.4 4-1.3 5.5a9.1 9.1 0 0 1-3.8 3.5c-1.6.8-3.5 1.3-5.7 1.3-2 0-3.7-.4-5.3-1s-2.8-1.8-3.7-3.2c-.9-1.3-1.4-3-1.4-5h6c.1.8.3 1.6.7 2.2s1 1.2 1.6 1.5c.7.4 1.5.5 2.4.5 1 0 1.8-.2 2.4-.6a4 4 0 0 0 1.6-1.8c.3-.8.5-1.8.5-3V45.5zm30.9 9.1a4.4 4.4 0 0 0-2-3.3 7.5 7.5 0 0 0-4.3-1.1c-1.3 0-2.4.2-3.3.5-.9.4-1.6 1-2 1.6a3.5 3.5 0 0 0-.3 4c.3.5.7.9 1.3 1.2l1.8 1 2 .5 3.2.8c1.3.3 2.5.7 3.7 1.2a13 13 0 0 1 3.2 1.8 8.1 8.1 0 0 1 3 6.5c0 2-.5 3.7-1.5 5.1a10 10 0 0 1-4.4 3.5c-1.8.8-4.1 1.2-6.8 1.2-2.6 0-4.9-.4-6.8-1.2-2-.8-3.4-2-4.5-3.5a10 10 0 0 1-1.7-5.6h6a5 5 0 0 0 3.5 4.6c1 .4 2.2.6 3.4.6 1.3 0 2.5-.2 3.5-.6 1-.4 1.8-1 2.4-1.7a4 4 0 0 0 .8-2.4c0-.9-.2-1.6-.7-2.2a11 11 0 0 0-2.1-1.4l-3.2-1-3.8-1c-2.8-.7-5-1.7-6.6-3.2a7.2 7.2 0 0 1-2.4-5.7 8 8 0 0 1 1.7-5 10 10 0 0 1 4.3-3.5c2-.8 4-1.2 6.4-1.2 2.3 0 4.4.4 6.2 1.2 1.8.8 3.2 2 4.3 3.4 1 1.4 1.5 3 1.5 5h-5.8z"/></svg>


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/icons/IBMIconTop.jsx:
--------------------------------------------------------------------------------
 1 | import * as React from 'react';
 2 | import SvgIcon from '@mui/material/SvgIcon';
 3 | 
 4 | export default function IBMIcon() {
 5 | 
 6 |     return (
 7 |         <SvgIcon style={{ height: '42px', width: '70px', color: "black" }}>
 8 |             <path d="M0 9.6v.876h6.23V9.6zm7.11 0v.876h8.876c0 0-.907-.876-2.107-.876zm10.618 0v.876h5.37l-.32-.876zm9.22 0l-.32.876h5.32V9.6zM0 11.303v.876h6.23v-.876zm7.11.001v.875h9.906c0 0-.116-.674-.317-.875zm10.618 0v.875h5.96l-.295-.875zm8.583 0l-.295.875h5.935v-.875zM1.8 13.006v.877h2.697v-.877zm7.11 0v.877H11.6v-.877zm5.32 0v.877h2.698c0 0 .17-.463.17-.877zm5.296 0v.877H24.3l-.32-.877zm6.205 0l-.32.877H30.2v-.877zM1.8 14.71v.876h2.697v-.876zm7.11 0v.876h6.9c0 0 .576-.45.76-.876zm10.618 0v.876h2.697V15.1l.17.487h4.94l.184-.487v.487H30.2v-.876h-5.064l-.27.742-.27-.742zM1.8 16.414v.876h2.697v-.876zm7.11 0v.876h7.65c-.184-.425-.76-.876-.76-.876zm10.618 0v.876h2.697v-.876zm3.188 0l.326.876h3.705l.3-.876zm4.806 0v.876H30.2v-.876zM1.8 18.117v.876h2.697v-.876zm7.11 0v.876H11.6v-.876zm5.32 0v.876h2.87c0-.413-.17-.876-.17-.876zm5.296 0v.876h2.697v-.876zm3.8 0l.316.876h2.484l.32-.876zm4.194 0v.876H30.2v-.876zM.05 19.82v.877h6.23v-.877zm7.063 0v.877h9.59c.202-.2.317-.877.317-.877zm10.666 0v.877h4.44v-.877zm6.155 0l.325.877h1.264l.305-.877zm3.58 0v.877H32v-.877zM.05 21.524v.876h6.23v-.876zm7.063 0v.875h6.77c1.2 0 2.108-.875 2.108-.875zm10.666 0v.876h4.44v-.876zm6.77 0l.313.873.054.001.318-.875zm2.964 0v.876H32v-.876z" />
 9 |         </SvgIcon>
10 |     );
11 | }


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/SignIn.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | 
 3 | import { signIn } from "next-auth/react";
 4 | import { useSearchParams } from "next/navigation";
 5 | import IBMIcon from "./icons/IBMIcon";
 6 | import { LineWeight } from "@mui/icons-material";
 7 | import { Grid } from "@mui/material";
 8 | import React, { Suspense } from "react";
 9 | 
10 | function SignInWithIBMIdContent() {
11 |   const searchParams = useSearchParams();
12 |   const callbackUrl = searchParams.get("callbackUrl") || "/";
13 | 
14 |   return (
15 |     <Grid>
16 |       <Grid item xs={12}>
17 |         <div
18 |           style={{
19 |             display: "flex",
20 |             justifyContent: "center",
21 |             alignItems: "center",
22 |             height: "80vh",
23 |           }}
24 |         >
25 |           <button onClick={() => signIn("IBMid", { callbackUrl: callbackUrl })}>
26 |             <span
27 |               style={{
28 |                 display: "flex",
29 |                 alignItems: "center",
30 |                 padding: "5px",
31 |                 fontFamily: "IBM Plex Sans",
32 |               }}
33 |             >
34 |               <IBMIcon />
35 |               <span style={{ marginLeft: "5px" }}>Sign in with IBMid</span>
36 |             </span>
37 |           </button>
38 |         </div>
39 |       </Grid>
40 |     </Grid>
41 |   );
42 | }
43 | 
44 | export default function SignInWithIBMId() {
45 |   return (
46 |     <Suspense fallback={<div>Loading...</div>}>
47 |       <SignInWithIBMIdContent />
48 |     </Suspense>
49 |   );
50 | }
51 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/services/MongoService.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dotenv import load_dotenv
 3 | from pymongo import MongoClient
 4 | from pymongo.errors import ConnectionFailure
 5 | from bson.objectid import ObjectId
 6 | 
 7 | load_dotenv()
 8 | 
 9 | class MongoService:
10 | 
11 |     def __init__(self):
12 |         # MongoDB backend
13 |         MONGO_URL=os.getenv('MONGO_URL')
14 |         MONGO_USER=os.getenv('MONGO_USER')
15 |         MONGO_PASS=os.getenv('MONGO_PASS')
16 | 
17 |         self.MONGO_DB=os.getenv('MONGO_DB')        
18 |         
19 |         ##f"mongodb://{MONGO_USER}:{MONGO_PASS}@{MONGO_URL}"
20 | 
21 |         client = MongoClient(
22 |             
23 |             f"mongodb://{MONGO_USER}:{MONGO_PASS}@{MONGO_URL}/{self.MONGO_DB}?authSource={self.MONGO_DB}",
24 |             ssl=True,
25 |             tlsCAFile="cert/mongo.crt"
26 |         )
27 |         self.client = client        
28 |         print(f"mongo client:{client}yyyy")
29 |     
30 |     def get_db(self):
31 |         db = self.client[self.MONGO_DB]
32 |         return db
33 | 
34 |     def get_collection(self, collection_name):
35 |         collection = self.get_db()[collection_name]
36 |         return collection
37 |     
38 |     def get_request_history_collection(self):
39 |         return self.get_collection('request_histories')
40 |     
41 |     def get_experiment_collection(self):
42 |         return self.get_collection('experiments')
43 |     
44 |     def find_one(self, collection, id):
45 |         one = collection.find_one({'_id': ObjectId(id)})
46 |         return one
47 | 
48 |     
49 | 
50 |     
51 | 
52 |     


--------------------------------------------------------------------------------
/JudgeIt-App/public/vercel.svg:
--------------------------------------------------------------------------------
1 | <svg>
2 |     <path
3 |       d="M58 21.467V23h-7.632v-1.533H58zm-18.316 0V23h-7.631v-1.533h7.631zm5.955 0L45.025 23l-.606-1.533h1.22zm-17.097 0A6.285 6.285 0 0 1 24.391 23H12.21v-1.533zm-17.858 0V23H0v-1.533h10.684zm29-3.067v1.533h-7.631V18.4h7.631zm7.148 0l-.594 1.533H43.82l-.598-1.533h3.609zm-16.764 0a5.719 5.719 0 0 1-.64 1.533H12.21V18.4zm-19.384 0v1.533H0V18.4h10.684zM58 18.4v1.533h-7.632V18.4H58zm-3.053-3.067v1.534h-4.579v-1.534h4.58zm-15.263 0v1.534h-4.579v-1.534h4.58zm8.345 0l-.6 1.534h-4.806l-.604-1.534h6.01zm-18.174 0c.137.49.213 1.003.213 1.534h-5.647v-1.534zm-10.013 0v1.534h-4.579v-1.534h4.58zm-12.21 0v1.534h-4.58v-1.534h4.58zm47.315-3.066V13.8h-4.579v-1.533h4.58zm-15.263 0V13.8h-4.579v-1.533h4.58zm9.541 0l-.597 1.533h-7.22l-.591-1.533h8.408zm-21.248 0c.527.432.98.951 1.328 1.533H15.263v-1.533zm-20.345 0V13.8h-4.58v-1.533h4.58zM44.599 9.2l.427 1.24.428-1.24h9.493v1.533h-4.579V9.324l-.519 1.41h-9.661l-.504-1.41v1.41h-4.579V9.2H44.6zm-36.967 0v1.533h-4.58V9.2h4.58zm21.673 0a5.95 5.95 0 0 1-1.328 1.533H15.263V9.2zm25.642-3.067v1.534h-8.964l.54-1.534h8.424zm-11.413 0l.54 1.534h-8.969V6.133h8.43zm-13.466 0c0 .531-.076 1.045-.213 1.534H24.42V6.133zm-10.226 0v1.534h-4.579V6.133h4.58zm-12.21 0v1.534h-4.58V6.133h4.58zm34.845-3.066l.53 1.533H32.054V3.067h10.424zm15.523 0V4.6H47.04l.55-1.533H58zm-28.573 0c.284.473.504.988.641 1.533H12.211V3.067zm-18.743 0V4.6H0V3.067h10.684zM41.406 0l.54 1.533h-9.893V0h9.353zM58 0v1.533h-9.881L48.647 0H58zM24.39 0c1.601 0 3.057.581 4.152 1.533H12.211V0zM10.685 0v1.533H0V0h10.684z"
4 |       fill="#161616"
5 |       fillRule="evenodd"
6 |     ></path>
7 | </svg>


--------------------------------------------------------------------------------
/REST-Service/deployment/base/rest-app/secret.yaml:
--------------------------------------------------------------------------------
1 | kind: Secret
2 | apiVersion: v1
3 | metadata:
4 |   name: mongodb-cert-secret
5 | data:
6 |   mongo.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUREekNDQWZlZ0F3SUJBZ0lKQU5FSDU4eTIva3pITUEwR0NTcUdTSWIzRFFFQkN3VUFNQjR4SERBYUJnTlYKQkFNTUUwbENUU0JEYkc5MVpDQkVZWFJoWW1GelpYTXdIaGNOTVRnd05qSTFNVFF5T1RBd1doY05Namd3TmpJeQpNVFF5T1RBd1dqQWVNUnd3R2dZRFZRUUREQk5KUWswZ1EyeHZkV1FnUkdGMFlXSmhjMlZ6TUlJQklqQU5CZ2txCmhraUc5dzBCQVFFRkFBT0NBUThBTUlJQkNnS0NBUUVBOGxwYVFHemNGZEdxZU1sbXFqZmZNUHBJUWhxcGQ4cUoKUHIzYklrclhKYlRjSko5dUlja1NVY0NqdzRaL3JTZzhublQxM1NDY09sKzF0bys3a2RNaVU4cU9XS2ljZVlaNQp5K3laWWZDa0dhaVpWZmF6UUJtNDV6QnRGV3YrQUIvOGhmQ1RkTkY3Vlk0c3BhQTNvQkUyYVM3T0FOTlNSWlNLCnB3eTI0SVVnVWNJTEpXK21jdlc4MFZ4K0dYUmZEOVl0dDZQUkpnQmhZdVVCcGd6dm5nbUNNR0JuK2wyS05pU2YKd2VvdllEQ0Q2Vm5nbDIrNlc5UUZBRnRXWFdnRjNpRFFENW5sL240bXJpcE1TWDZVRy9uNjY1N3U3VERkZ2t2QQoxZUtJMkZMellLcG9LQmU1cmNuck03bkhnTmMvbkNkRXM1SmVjSGIxZEh2MVFmUG02cHpJeHdJREFRQUJvMUF3ClRqQWRCZ05WSFE0RUZnUVVLMytYWm8xd3lLcytERW9ZWGJIcnV3U3BYamd3SHdZRFZSMGpCQmd3Rm9BVUszK1gKWm8xd3lLcytERW9ZWGJIcnV3U3BYamd3REFZRFZSMFRCQVV3QXdFQi96QU5CZ2txaGtpRzl3MEJBUXNGQUFPQwpBUUVBSmY1ZHZselVwcWFpeDI2cUpFdXFGRzBJUDU3UVFJNVRDUko2WHQvc3VwUkhvNjNlRHZLdzh6Ujd0bFdRCmxWNVAwTjJ4d3VTbDlacUFKdDcvay8zWmVCK25Zd1BveU8zS3ZLdkFUdW5SdmxQQm40RldWWGVhUHNHKzdmaFMKcXNlam1reW9uWXc3N0hSekdPekpINFpnOFVONm1mcGJhV1NzeWFFeHZxa25DcDlTb1RRUDNENjdBeldxYjF6WQpkb3FxZ0dJWjJueENrcDUvRlh4Ri9UTWI1NXZ0ZVRRd2ZnQnk2MGpWVmtiRjdlVk9XQ3YwS2FOSFBGNWhycWJOCmkrM1hqSjcvcGVGM3hNdlRNb3kzNURjVDNFMlplU1Zqb3VaczE1Tzkwa0kzazJkYVMyT0hKQUJXMHZTajRuTHoKK1BRenAvQjljUW1PTzhkQ2UwNDlRM29hVUE9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCgo=
7 | type: Opaque
8 | 


--------------------------------------------------------------------------------
/REST-Service/main.py:
--------------------------------------------------------------------------------
 1 | from fastapi import FastAPI, Request, HTTPException
 2 | from fastapi.middleware.trustedhost import TrustedHostMiddleware
 3 | import uvicorn
 4 | import logging
 5 | from dotenv import load_dotenv
 6 | from app.route.root import routes as root_api
 7 | from app.route.llm_judge import routes as llm_judge_api
 8 | from app.route.llm_manage import routes as judge_management_api
 9 | from fastapi.middleware.cors import CORSMiddleware
10 | import os
11 | from app.src.config.TimeoutMiddleware import TimeoutMiddleware
12 | 
13 | load_dotenv()
14 | platform = os.environ.get("PLATFORM")
15 | server_url = os.environ.get("SERVER_URL", default="http://localhost:3001")
16 | 
17 | app = FastAPI(
18 |     title="LLM JUDGE API",
19 |     description="This api will be used to judge llm response and get ratings and feedback",
20 |     version="1.0.1-fastapi",
21 |     servers=[
22 |         {
23 |             "url": server_url
24 |         }
25 |     ],
26 | )
27 | 
28 | logging.basicConfig(level=logging.INFO)
29 | logger = logging.getLogger('api-service')
30 | 
31 | # Register blueprints
32 | app.include_router(root_api.root_api_route)
33 | app.include_router(llm_judge_api.judge_api_route)
34 | app.include_router(judge_management_api.judge_management_api_route)
35 | 
36 | origins = [ "*"]
37 | 
38 | app.add_middleware(
39 |     CORSMiddleware,
40 |     allow_origins=origins,
41 |     allow_credentials=False,
42 |     allow_methods=["*"],
43 |     allow_headers=["*"],
44 | )
45 | 
46 | app.add_middleware(TimeoutMiddleware, timeout=600)  # Timeout set to 600 seconds (10 minutes)
47 | 
48 | if __name__ == '__main__':
49 |     uvicorn.run("main:app", host='0.0.0.0', port=3001)


--------------------------------------------------------------------------------
/JudgeIt-App/app/pages/help/page.js:
--------------------------------------------------------------------------------
 1 | import BatchInstructions from "@/components/globals/BatchInstructions";
 2 | import Footer from "@/components/globals/Footer";
 3 | import SingleInstructions from "@/components/globals/SingleInstructions";
 4 | import { Box, Grid, Paper, Typography } from "@mui/material";
 5 | import React from "react";
 6 | 
 7 | const HelperPage = () => {
 8 |   return (
 9 |     <div
10 |       style={{
11 |         marginTop: "20px",
12 |         marginLeft: "50px",
13 |         height: "90vh",
14 |         overflowY: "scroll",
15 |         marginBottom: "20px",
16 |       }}
17 |     >
18 |       <Grid spacing={5} sx={{ flexGrow: 1 }} container>
19 |         <Grid item xs={12} justifyContent={"center"} display={"flex"}>
20 |           <Typography
21 |             style={{
22 |               fontSize: "36px",
23 |               color: "#3B3B3B",
24 |               fontWeight: "bold",
25 |             }}
26 |           >
27 |             Documentation
28 |           </Typography>
29 |         </Grid>
30 |         <Grid item xs={11} spacing={5} sx={{ flexGrow: 1 }} container>
31 |           <Grid item xs={6}>
32 |             <Box
33 |               border={"1px solid grey"}
34 |               borderRadius={"5px"}
35 |               padding={"10px"}
36 |             >
37 |               <SingleInstructions />
38 |             </Box>
39 |           </Grid>
40 |           <Grid item xs={6}>
41 |             <Box
42 |               border={"1px solid grey"}
43 |               borderRadius={"5px"}
44 |               padding={"10px"}
45 |             >
46 |               <BatchInstructions />
47 |             </Box>
48 |           </Grid>
49 |         </Grid>
50 |       </Grid>
51 |       <Grid item xs={12}>
52 |         <Footer />
53 |       </Grid>
54 |     </div>
55 |   );
56 | };
57 | 
58 | export default HelperPage;
59 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/icons/IBMIcon.jsx:
--------------------------------------------------------------------------------
 1 | import * as React from "react";
 2 | import SvgIcon from "@mui/material/SvgIcon";
 3 | 
 4 | export default function IBMIcon() {
 5 |   return (
 6 |     <SvgIcon style={{ height: "20px", width: "90px" }}>
 7 |       <path
 8 |         d="M58 21.467V23h-7.632v-1.533H58zm-18.316 0V23h-7.631v-1.533h7.631zm5.955 0L45.025 23l-.606-1.533h1.22zm-17.097 0A6.285 6.285 0 0 1 24.391 23H12.21v-1.533zm-17.858 0V23H0v-1.533h10.684zm29-3.067v1.533h-7.631V18.4h7.631zm7.148 0l-.594 1.533H43.82l-.598-1.533h3.609zm-16.764 0a5.719 5.719 0 0 1-.64 1.533H12.21V18.4zm-19.384 0v1.533H0V18.4h10.684zM58 18.4v1.533h-7.632V18.4H58zm-3.053-3.067v1.534h-4.579v-1.534h4.58zm-15.263 0v1.534h-4.579v-1.534h4.58zm8.345 0l-.6 1.534h-4.806l-.604-1.534h6.01zm-18.174 0c.137.49.213 1.003.213 1.534h-5.647v-1.534zm-10.013 0v1.534h-4.579v-1.534h4.58zm-12.21 0v1.534h-4.58v-1.534h4.58zm47.315-3.066V13.8h-4.579v-1.533h4.58zm-15.263 0V13.8h-4.579v-1.533h4.58zm9.541 0l-.597 1.533h-7.22l-.591-1.533h8.408zm-21.248 0c.527.432.98.951 1.328 1.533H15.263v-1.533zm-20.345 0V13.8h-4.58v-1.533h4.58zM44.599 9.2l.427 1.24.428-1.24h9.493v1.533h-4.579V9.324l-.519 1.41h-9.661l-.504-1.41v1.41h-4.579V9.2H44.6zm-36.967 0v1.533h-4.58V9.2h4.58zm21.673 0a5.95 5.95 0 0 1-1.328 1.533H15.263V9.2zm25.642-3.067v1.534h-8.964l.54-1.534h8.424zm-11.413 0l.54 1.534h-8.969V6.133h8.43zm-13.466 0c0 .531-.076 1.045-.213 1.534H24.42V6.133zm-10.226 0v1.534h-4.579V6.133h4.58zm-12.21 0v1.534h-4.58V6.133h4.58zm34.845-3.066l.53 1.533H32.054V3.067h10.424zm15.523 0V4.6H47.04l.55-1.533H58zm-28.573 0c.284.473.504.988.641 1.533H12.211V3.067zm-18.743 0V4.6H0V3.067h10.684zM41.406 0l.54 1.533h-9.893V0h9.353zM58 0v1.533h-9.881L48.647 0H58zM24.39 0c1.601 0 3.057.581 4.152 1.533H12.211V0zM10.685 0v1.533H0V0h10.684z"
 9 |         fill="#161616"
10 |         fillRule="evenodd"
11 |       ></path>
12 |     </SvgIcon>
13 |   );
14 | }
15 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/DeleteConfirmationDialog.jsx:
--------------------------------------------------------------------------------
 1 | import React, { useState } from 'react';
 2 | import Button from '@mui/material/Button';
 3 | import Dialog from '@mui/material/Dialog';
 4 | import DialogActions from '@mui/material/DialogActions';
 5 | import DialogContent from '@mui/material/DialogContent';
 6 | import DialogContentText from '@mui/material/DialogContentText';
 7 | import DialogTitle from '@mui/material/DialogTitle';
 8 | 
 9 | const DeleteConfirmationDialog = ({ itemName, onDelete }) => {
10 |   const [open, setOpen] = useState(false);
11 | 
12 |   const handleClickOpen = () => {
13 |     setOpen(true);
14 |   };
15 | 
16 |   const handleClose = () => {
17 |     setOpen(false);
18 |   };
19 | 
20 |   const handleConfirmDelete = () => {
21 |     onDelete();  // Call the delete action
22 |     handleClose();  // Close the dialog
23 |   };
24 | 
25 |   return (
26 |     <div>
27 |       <Button variant="outlined" color="secondary" onClick={handleClickOpen}>
28 |         Delete {itemName}
29 |       </Button>
30 |       <Dialog
31 |         open={open}
32 |         onClose={handleClose}
33 |         aria-labelledby="delete-confirmation-dialog-title"
34 |         aria-describedby="delete-confirmation-dialog-description"
35 |       >
36 |         <DialogTitle id="delete-confirmation-dialog-title">
37 |           {"Confirm Delete"}
38 |         </DialogTitle>
39 |         <DialogContent>
40 |           <DialogContentText id="delete-confirmation-dialog-description">
41 |             Are you sure you want to delete {itemName}? This action cannot be undone.
42 |           </DialogContentText>
43 |         </DialogContent>
44 |         <DialogActions>
45 |           <Button onClick={handleClose} color="primary">
46 |             Cancel
47 |           </Button>
48 |           <Button onClick={handleConfirmDelete} color="secondary" autoFocus>
49 |             Confirm Delete
50 |           </Button>
51 |         </DialogActions>
52 |       </Dialog>
53 |     </div>
54 |   );
55 | };
56 | 
57 | export default DeleteConfirmationDialog;
58 | 


--------------------------------------------------------------------------------
/REST-Service/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   fastapi_app:
 3 |     container_name: fastapi_app
 4 |     platform: linux/amd64
 5 |     image: fastapi_app_image
 6 |     #volumes:
 7 |     #  - ./app:/app
 8 |     ports:
 9 |       - 3001:3001
10 |     environment:
11 |       - WATSONX_URL=https://us-south.ml.cloud.ibm.com
12 |       - WX_PROJECT_ID=***
13 |       - IBM_CLOUD_API_KEY=***
14 |       - LLM_JUDGE_API_KEY=JudgeIt-Secret-Api-Key
15 |       - WX_PLATFORM=saas
16 |       - WX_USER=''
17 |       - WX_GOV_REGION=eu-de
18 |       - CELERY_BROKER_URL=redis://redis:6379/0
19 |       - CELERY_RESULT_BACKEND=redis://redis:6379/0
20 |       - SERVER_URL=http://localhost:3001
21 |       - MONGO_URL=***
22 |       - MONGO_USER=***
23 |       - MONGO_PASS=***
24 |       - MONGO_DB=judge_it_dev
25 |       - WX_NEG_TEST_MODEL=mistralai/mistral-medium-2505
26 |       - WX_GOV_INSTANCE=
27 |     restart: always
28 |   redis:
29 |     container_name: redis
30 |     image: redis:7.2.5-alpine
31 |     restart: always
32 |   celery_worker:
33 |     container_name: celery_worker
34 |     build: .
35 |     #volumes:
36 |     #  - ./app:/app
37 |     command: celery -A app.celery.celery_worker.celery worker --loglevel=info
38 |     environment:
39 |       - WATSONX_URL=https://us-south.ml.cloud.ibm.com
40 |       - WX_PROJECT_ID=***
41 |       - WX_PLATFORM=saas
42 |       - WX_USER=''
43 |       - WX_GOV_REGION=eu-de
44 |       - IBM_CLOUD_API_KEY=***
45 |       - CELERY_BROKER_URL=redis://redis:6379/0
46 |       - CELERY_RESULT_BACKEND=redis://redis:6379/0
47 |       - WX_NEG_TEST_MODEL=mistralai/mistral-medium-2505
48 |       - WX_GOV_INSTANCE=
49 |     depends_on:
50 |       - fastapi_app
51 |       - redis
52 |     restart: always
53 |   flower:
54 |     container_name: flower
55 |     build: .
56 |     command: celery --broker=redis://redis:6379/0 flower --port=5555
57 |     ports:
58 |       - 5556:5555
59 |     environment:
60 |       - CELERY_BROKER_URL=redis://redis:6379/0
61 |       - CELERY_RESULT_BACKEND=redis://redis:6379/0
62 |     depends_on:
63 |       - fastapi_app
64 |       - redis
65 |       - celery_worker
66 |     restart: always


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/MultiTurnWithConversationForm.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import React from "react";
 3 | import { TextField, Box } from "@mui/material";
 4 | 
 5 | const MultiTurnWithConversationForm = ({
 6 |   values,
 7 |   handleChange,
 8 |   handleBlur,
 9 |   errors,
10 |   touched,
11 | }) => {
12 |   return (
13 |     <div>
14 |       <Box marginBottom={"20px"} margin={"20px"}>
15 |         <TextField
16 |           label="Conversation history"
17 |           name="conversation_history"
18 |           value={values.conversation_history}
19 |           onChange={handleChange}
20 |           onBlur={handleBlur}
21 |           error={touched.conversation_history && Boolean(errors.conversation_history)}
22 |           helperText={touched.conversation_history && errors.conversation_history}
23 |           style={{ width: "100%" }}
24 |           rows={"8"}
25 |           multiline
26 |         />
27 |       </Box>
28 |       <Box marginBottom={"20px"} marginLeft={"20px"} marginRight={'20px'}>
29 |         <TextField
30 |           label="Follow up query"
31 |           name="follow_up_query"
32 |           value={values.follow_up_query}
33 |           onChange={handleChange}
34 |           onBlur={handleBlur}
35 |           error={touched.follow_up_query && Boolean(errors.follow_up_query)}
36 |           helperText={touched.follow_up_query && errors.follow_up_query}
37 |           style={{ width: "100%" }}
38 |         />
39 |       </Box>
40 |       <Box marginBottom={"20px"} marginLeft={"20px"} marginRight={'20px'}>
41 |         <TextField
42 |           label="Golden query"
43 |           name="golden_query"
44 |           value={values.golden_query}
45 |           onChange={handleChange}
46 |           onBlur={handleBlur}
47 |           error={touched.golden_query && Boolean(errors.golden_query)}
48 |           helperText={touched.golden_query && errors.golden_query}
49 |           style={{ width: "100%" }}
50 |         />
51 |       </Box>
52 |       <Box marginBottom={"20px"} marginLeft={"20px"} marginRight={'20px'}>
53 |         <TextField
54 |           label="Rewritten query "
55 |           name="rewritten_query"
56 |           value={values.rewritten_query}
57 |           onChange={handleChange}
58 |           onBlur={handleBlur}
59 |           error={touched.rewritten_query && Boolean(errors.rewritten_query)}
60 |           helperText={touched.rewritten_query && errors.rewritten_query}
61 |           style={{ width: "100%" }}
62 |         />
63 |       </Box>
64 |     </div>
65 |   );
66 | };
67 | 
68 | export default MultiTurnWithConversationForm;
69 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/SoloResult.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import {
 3 |   Alert,
 4 |   Table,
 5 |   TableHead,
 6 |   TableRow,
 7 |   TableCell,
 8 |   TableBody,
 9 |   Paper,
10 | } from "@mui/material";
11 | import {
12 |   API_TYPE_MULTITURN,
13 |   API_TYPE_SINGLETURN,
14 |   API_TYPE_RATING,
15 |   API_TYPE_SIMILARITY,
16 | } from "@/services/Config";
17 | 
18 | import { grade_map_rating, grade_map_similarity, grade_map_multiturn } from "@/services/Config";
19 | 
20 | const grade_col_name = "JudgeIt Score"
21 | const explanation_col_name = "JudgeIt Reasoning"
22 | 
23 | const SoloResult = ({ data, api_type }) => {
24 |   return (
25 |     <Paper
26 |       elevation={2}
27 |       sx={{ width: "95%", marginBottom: "10px", backgroundColor: "#F0F7FF" }}
28 |     >
29 |       <Table>
30 |         <TableHead>
31 |           {api_type === API_TYPE_RATING && (
32 |             <TableRow>
33 |               <TableCell sx={{fontWeight: "bold"}}>{grade_col_name}</TableCell>
34 |               <TableCell sx={{fontWeight: "bold"}}>{explanation_col_name}</TableCell>
35 |             </TableRow>
36 |           )}
37 |           {api_type === API_TYPE_SIMILARITY && (
38 |             <TableRow>
39 |               <TableCell sx={{fontWeight: "bold"}}>{grade_col_name}</TableCell>
40 |               <TableCell sx={{fontWeight: "bold"}}>{explanation_col_name}</TableCell>
41 |             </TableRow>
42 |           )}
43 |           {(api_type === API_TYPE_MULTITURN || api_type === API_TYPE_SINGLETURN) && (
44 |             <TableRow>
45 |               <TableCell sx={{fontWeight: "bold"}}>{grade_col_name}</TableCell>
46 |             </TableRow>
47 |           )}
48 |         </TableHead>
49 |         <TableBody>
50 |           {api_type === API_TYPE_RATING && (
51 |             <TableRow>
52 |               <TableCell>{grade_map_rating[data.Grade]}</TableCell>
53 |               <TableCell>{data.Explanation}</TableCell>
54 |             </TableRow>
55 |           )}
56 |           {api_type === API_TYPE_SIMILARITY && (
57 |             <TableRow>
58 |               <TableCell>{grade_map_similarity[data.Grade]}</TableCell>
59 |               <TableCell>{data.Explanation}</TableCell>
60 |             </TableRow>
61 |           )}
62 |           {(api_type === API_TYPE_MULTITURN || api_type === API_TYPE_SINGLETURN) && (
63 |             <TableRow>
64 |               <TableCell>{grade_map_multiturn[data.Grade]}</TableCell>
65 |             </TableRow>
66 |           )}
67 |         </TableBody>
68 |       </Table>
69 |     </Paper>
70 |   );
71 | };
72 | 
73 | export default SoloResult;
74 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/RatingSimilarityDataGrid.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import React from "react";
 3 | import { DataGrid } from "@mui/x-data-grid";
 4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
 5 | import { API_TYPE_RATING, grade_map_rating, grade_map_similarity } from "@/services/Config";
 6 | 
 7 | const RatingSimilarityDataGrid = ({ serverData }) => {
 8 |   const columns = [
 9 |     {
10 |       field: "id",
11 |       headerName: "Id",
12 |       hide: true,
13 |     },
14 |     {
15 |       field: "name",
16 |       headerName: "Name",
17 |       width: "250",
18 |     },
19 |     {
20 |       field: "eval_type",
21 |       headerName: "Eval Type",
22 |     },
23 |     {
24 |       field: "model",
25 |       headerName: "Model",
26 |       width: "250",
27 |     },
28 |     {
29 |       field: "golden_text",
30 |       headerName: "Golden Text",
31 |       width: "400",
32 |     },
33 |     {
34 |       field: "generated_text",
35 |       headerName: "Generated Text",
36 |       width: "400",
37 |     },
38 |     {
39 |       field: "Grade",
40 |       headerName: "JudgeIt Score",
41 |       width: 100,
42 |     },
43 |     {
44 |       field: "Explanation",
45 |       headerName: "JudgeIt Reasoning",
46 |       width: "400",
47 |     },
48 |   ];
49 | 
50 |   return (
51 |     <div style={{ width: "100%" }}>
52 |       {" "}
53 |       <DataGrid
54 |         {...{
55 |           columns: columns,
56 |           rows: serverData.map((item) => {
57 |             return {
58 |               id: item._id,
59 |               name: item.name,
60 |               eval_type: item.eval_type,
61 |               model: item.content.query.model,
62 |               golden_text: item.content.query.golden_text,
63 |               generated_text: item.content.query.generated_text,
64 |               Grade: (item.eval_type === API_TYPE_RATING) ? grade_map_rating[item.content.result.Grade] : grade_map_similarity[item.content.result.Grade],
65 |               Explanation: item.content.result.Explanation,
66 |             };
67 |           }),
68 |         }}
69 |         density="compact"
70 |         getRowHeight={() => "auto"}
71 |         autoHeight={true}
72 |         initialState={{
73 |           ...{
74 |             columns: columns,
75 |             rows: [],
76 |           }.initialState,
77 |           pagination: { paginationModel: { pageSize: 10 } },
78 |         }}
79 |         pageSizeOptions={[5, 10, 25]}
80 |         slots={{ toolbar: DataGridToolbar }}
81 |       />
82 |     </div>
83 |   );
84 | };
85 | 
86 | export default RatingSimilarityDataGrid;
87 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DataGridMultiTurnConversation.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import React from "react";
 3 | import { DataGrid } from "@mui/x-data-grid";
 4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
 5 | 
 6 | const DataGridMultiTurnConversation = ({ serverData }) => {
 7 |   const columns = [
 8 |     {
 9 |       field: "id",
10 |       headerName: "Id",
11 |       hide: true,
12 |     },
13 |     {
14 |       field: "name",
15 |       headerName: "Name",
16 |       width: "250",
17 |     },
18 |     {
19 |       field: "eval_type",
20 |       headerName: "Eval Type",
21 |     },
22 |     {
23 |       field: "model",
24 |       headerName: "Model",
25 |       width: "250",
26 |     },
27 |     {
28 |       field: "conversation_history",
29 |       headerName: "Conversation history",
30 |       width: "400",
31 |     },
32 |     {
33 |       field: "follow_up_query",
34 |       headerName: "Follow up query",
35 |       width: "400",
36 |     },
37 |     {
38 |       field: "golden_query",
39 |       headerName: "Golden query",
40 |       width: "400",
41 |     },
42 |     {
43 |       field: "rewritten_query",
44 |       headerName: "Rewritten query",
45 |       width: "400",
46 |     },
47 |     {
48 |       field: "Grade",
49 |       headerName: "Grade",
50 |       width: 100,
51 |     }
52 |   ];
53 | 
54 |   return (
55 |     <div style={{height: 350}}>
56 |       {" "}
57 |       <DataGrid
58 |         {...{
59 |           columns: columns,
60 |           rows: serverData.map((item) => {
61 |             return {
62 |               id: item._id,
63 |               name: item.name,
64 |               eval_type: item.eval_type,
65 |               model: item.content.query.model,
66 |               conversation_history: item.content.query.conversation_history,
67 |               follow_up_query: item.content.query.follow_up_query,
68 |               golden_query: item.content.query.golden_query,
69 |               rewritten_query: item.content.query.rewritten_query,
70 |               Grade: item.content.result.Grade
71 |             };
72 |           }),
73 |         }}
74 |         density="compact"
75 |         getRowHeight={() => "auto"}
76 |         autoHeight={true}
77 |         initialState={{
78 |           ...{
79 |             columns: columns,
80 |             rows: [],
81 |           }.initialState,
82 |           pagination: { paginationModel: { pageSize: 10 } },
83 |         }}
84 |         pageSizeOptions={[5, 10, 25]}
85 |         slots={{ toolbar: DataGridToolbar }}
86 |       />
87 |     </div>
88 |   );
89 | };
90 | 
91 | export default DataGridMultiTurnConversation;
92 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DataGridMultiTurnSummary.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import React from "react";
 3 | import { DataGrid } from "@mui/x-data-grid";
 4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
 5 | 
 6 | const DataGridMultiTurnSummaryConversation = ({ serverData }) => {
 7 |   const columns = [
 8 |     {
 9 |       field: "id",
10 |       headerName: "Id",
11 |       hide: true,
12 |     },
13 |     {
14 |       field: "name",
15 |       headerName: "Name",
16 |       width: "250",
17 |     },
18 |     {
19 |       field: "experiment_name",
20 |       headerName: "Experiment Name",
21 |       width: "250",
22 |     },
23 |     {
24 |       field: "eval_type",
25 |       headerName: "Eval Type",
26 |     },
27 |     {
28 |       field: "conversation_history",
29 |       headerName: "Conversation history",
30 |       width: "500",
31 |     },
32 |     {
33 |       field: "follow_up_query",
34 |       headerName: "Follow up query",
35 |       width: "300",
36 |     },
37 |     {
38 |       field: "golden_query",
39 |       headerName: "Golden query",
40 |       width: "300",
41 |     },
42 |     {
43 |       field: "rewritten_query",
44 |       headerName: "Rewritten query",
45 |       width: "300",
46 |     },
47 |     {
48 |       field: "Grade",
49 |       headerName: "JudgeIt Score",
50 |       width: 100,
51 |     }
52 |   ];
53 | 
54 |   return (
55 |     <div style={{height: 350}}>
56 |       {" "}
57 |       <DataGrid
58 |         {...{
59 |           columns: columns,
60 |           rows: serverData.map((item) => {
61 |             return {
62 |               id: item._id,
63 |               name: item.name,
64 |               eval_type: item.eval_type,
65 |               experiment_name: item.experiment_name,
66 |               conversation_history: item.conversation_history,
67 |               follow_up_query: item.follow_up_query,
68 |               golden_query: item.golden_query,
69 |               rewritten_query: item.rewritten_query,
70 |               Grade: (item?.Grade) ? item?.Grade : item?.judgeit_score
71 |             };
72 |           }),
73 |         }}
74 |         density="compact"
75 |         getRowHeight={() => "auto"}
76 |         autoHeight={true}
77 |         initialState={{
78 |           ...{
79 |             columns: columns,
80 |             rows: [],
81 |           }.initialState,
82 |           pagination: { paginationModel: { pageSize: 10 } },
83 |         }}
84 |         pageSizeOptions={[5, 10, 25]}
85 |         slots={{ toolbar: DataGridToolbar }}
86 |       />
87 |     </div>
88 |   );
89 | };
90 | 
91 | export default DataGridMultiTurnSummaryConversation;
92 | 


--------------------------------------------------------------------------------
/JudgeIt-App/utils/Helper.js:
--------------------------------------------------------------------------------
 1 | import {
 2 |   API_TYPE_MULTITURN,
 3 |   API_TYPE_RATING,
 4 |   API_TYPE_SIMILARITY,
 5 |   grade_map_multiturn,
 6 |   grade_map_rating,
 7 |   grade_map_similarity,
 8 | } from "@/services/Config";
 9 | 
10 | export function getRandomInt(max) {
11 |   return Math.floor(Math.random() * max);
12 | }
13 | 
14 | export function generateRandomString(length = 4) {
15 |   const characters =
16 |     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
17 |   let result = "";
18 |   for (let i = 0; i < length; i++) {
19 |     const randomIndex = Math.floor(Math.random() * characters.length);
20 |     result += characters.charAt(randomIndex);
21 |   }
22 |   return result;
23 | }
24 | 
25 | // Function to generate columns dynamically from JSON object keys
26 | export const generateColumns = (jsonObject) => {
27 |   return Object.keys(jsonObject).map((key) => ({
28 |     field: key,
29 |     headerName: rename_grade_explanation_cloumn_name(key), // Capitalize the header
30 |     width: 300, // You can adjust the width or make it dynamic
31 |   }));
32 | };
33 | 
34 | const rename_grade_explanation_cloumn_name = (column_name) => {
35 |   if (column_name === "Grade") {
36 |     return "JudgeIt Score";
37 |   } else if (column_name === "Explanation") {
38 |     return "JudgeIt Reasoning";
39 |   } else {
40 |     return column_name.charAt(0).toUpperCase() + column_name.slice(1);
41 |   }
42 | };
43 | 
44 | // Function to generate rows dynamically from JSON object
45 | export const generateRows = (jsonObject, eval_type) => {
46 |   const firstKey = Object.keys(jsonObject)[0]; // Get the first key to check structure
47 |   const rowIds = Object.keys(jsonObject[firstKey]); // Assuming same structure for all keys
48 | 
49 |   return rowIds.map((_, index) => {
50 |     const rowData = { id: index }; // Initialize row with id
51 |     Object.keys(jsonObject).forEach((field) => {
52 |       rowData[field] = get_rating_label(
53 |         eval_type,
54 |         field,
55 |         jsonObject[field][index]
56 |       ); // Add data for each field
57 |     });
58 |     return rowData;
59 |   });
60 | };
61 | 
62 | const get_rating_label = (eval_type, column_name, value) => {
63 |   if (column_name !== "Grade") return value;
64 | 
65 |   const gradeMap = {
66 |     [API_TYPE_RATING]: grade_map_rating,
67 |     [API_TYPE_SIMILARITY]: grade_map_similarity,
68 |     [API_TYPE_MULTITURN]: grade_map_multiturn,
69 |   };
70 | 
71 |   return gradeMap[eval_type]?.[value] || value;
72 | };
73 | 
74 | export function trimText(text) {
75 |   if (text.length > 15) {
76 |     return text.substring(0, 15) + "..";
77 |   }
78 |   return text;
79 | }
80 | 


--------------------------------------------------------------------------------
/Framework/wml_setup.py:
--------------------------------------------------------------------------------
 1 | from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
 2 | from ibm_watsonx_ai.foundation_models import Model
 3 | 
 4 | #config Watsonx.ai environment
 5 | api_key = ''    
 6 | ibm_cloud_url = 'https://us-south.ml.cloud.ibm.com'
 7 | project_id = ''
 8 | 
 9 | def send_to_watsonxai(prompts,
10 |                     model_id="MIXTRAL",
11 |                     decoding_method="greedy",
12 |                     max_new_tokens=500,
13 |                     min_new_tokens=30,
14 |                     temperature=1.0,
15 |                     repetition_penalty=1.0
16 |                     ):
17 |     if  model_id == "MIXTRAL":
18 |          model_name = "mistralai/mixtral-8x7b-instruct-v01"
19 |     elif model_id == "LLAMA3":
20 |          model_name="meta-llama/llama-3-70b-instruct"
21 |     # Instantiate parameters for text generation
22 |     model_params = {
23 |         GenParams.DECODING_METHOD: decoding_method,
24 |         GenParams.MIN_NEW_TOKENS: min_new_tokens,
25 |         GenParams.MAX_NEW_TOKENS: max_new_tokens,
26 |         GenParams.RANDOM_SEED: 42,
27 |         GenParams.TEMPERATURE: temperature,
28 |         GenParams.REPETITION_PENALTY: repetition_penalty,
29 |     }
30 |     model = Model(
31 |         model_id=model_name,
32 |         params=model_params,
33 |         credentials={
34 | 		"url" : ibm_cloud_url,
35 | 		"apikey" : api_key
36 |             },
37 |         project_id=project_id)
38 | 
39 |     response=model.generate_text(prompts)
40 |     return response
41 | 
42 | 
43 | def send_to_watsonxai_multi_turn(prompts,
44 |                     model_id="MIXTRAL",
45 |                     decoding_method="greedy",
46 |                     max_new_tokens=128,
47 |                     temperature=0.7,
48 |                     repetition_penalty=1.0
49 |                     ):
50 |     if  model_id == "MIXTRAL":
51 |          model_name = "mistralai/mixtral-8x7b-instruct-v01"
52 |     elif model_id == "LLAMA3":
53 |          model_name="meta-llama/llama-3-70b-instruct"
54 |     # Instantiate parameters for text generation
55 |     model_params = {
56 |         GenParams.DECODING_METHOD: decoding_method,
57 |         GenParams.MAX_NEW_TOKENS: max_new_tokens,
58 |         GenParams.RANDOM_SEED: 42,
59 |         GenParams.TEMPERATURE: temperature,
60 |         GenParams.REPETITION_PENALTY: repetition_penalty,
61 |     }
62 |     model = Model(
63 |         model_id=model_name,
64 |         params=model_params,
65 |         credentials={
66 | 		"url" : ibm_cloud_url,
67 | 		"apikey" : api_key
68 |             },
69 |         project_id=project_id)
70 | 
71 |     response=model.generate_text(prompts)
72 |     return response


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/RatingSimilarityDataGridSummary.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import React from "react";
 3 | import { DataGrid } from "@mui/x-data-grid";
 4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
 5 | import { API_TYPE_RATING, grade_map_rating, grade_map_similarity } from "@/services/Config";
 6 | 
 7 | const RatingSimilarityDataGridSummary = ({ serverData }) => {
 8 |   const columns = [
 9 |     {
10 |       field: "id",
11 |       headerName: "Id",
12 |       hide: true,
13 |     },
14 |     {
15 |       field: "Question",
16 |       headerName: "Question",
17 |       width: "250",
18 |     },
19 |     {
20 |       field: "experiment_name",
21 |       headerName: "Experiment Name",
22 |       width: "100",
23 |     },
24 |     {
25 |       field: "name",
26 |       headerName: "Name",
27 |       width: "100",
28 |     },
29 |     {
30 |       field: "eval_type",
31 |       headerName: "Eval Type",
32 |     },
33 |     {
34 |       field: "golden_text",
35 |       headerName: "Golden Text",
36 |       width: "400",
37 |     },
38 |     {
39 |       field: "generated_text",
40 |       headerName: "Generated Text",
41 |       width: "400",
42 |     },
43 |     {
44 |       field: "Grade",
45 |       headerName: "JudgeIt Score",
46 |       width: 100,
47 |     },
48 |     {
49 |       field: "Explanation",
50 |       headerName: "JudgeIt Reasoning",
51 |       width: "400",
52 |     },
53 |   ];
54 | 
55 |   return (
56 |     <div style={{ width: "100%" }}>
57 |       <DataGrid
58 |         {...{
59 |           columns: columns,
60 |           rows: serverData.map((item) => {
61 |             return {
62 |               id: item._id,
63 |               Question: item.question,
64 |               experiment_name: item.experiment_name,
65 |               name: item.name,
66 |               eval_type: item.eval_type,
67 |               golden_text: item.golden_text,
68 |               generated_text: item.generated_text,
69 |               Grade: (item?.Grade) ? item?.Grade : item?.judgeit_score,
70 |               Explanation: (item?.Explanation) ? item?.Grade : item?.judgeit_reasoning
71 |             };
72 |           }),
73 |         }}
74 |         density="compact"
75 |         getRowHeight={() => "auto"}
76 |         autoHeight={true}
77 |         initialState={{
78 |           ...{
79 |             columns: columns,
80 |             rows: [],
81 |           }.initialState,
82 |           pagination: { paginationModel: { pageSize: 10 } },
83 |         }}
84 |         pageSizeOptions={[5, 10, 25]}
85 |         slots={{ toolbar: DataGridToolbar }}
86 |       />
87 |     </div>
88 |   );
89 | };
90 | 
91 | export default RatingSimilarityDataGridSummary;
92 | 


--------------------------------------------------------------------------------
/JudgeIt-App/app/globals.css:
--------------------------------------------------------------------------------
  1 | :root {
  2 |   --max-width: 1100px;
  3 |   --border-radius: 12px;
  4 |   --font-mono: ui-monospace, Menlo, Monaco, "Cascadia Mono", "Segoe UI Mono",
  5 |     "Roboto Mono", "Oxygen Mono", "Ubuntu Monospace", "Source Code Pro",
  6 |     "Fira Mono", "Droid Sans Mono", "Courier New", monospace;
  7 | 
  8 |   --foreground-rgb: 0, 0, 0;
  9 |   --background-start-rgb: 214, 219, 220;
 10 |   --background-end-rgb: 255, 255, 255;
 11 | 
 12 |   --primary-glow: conic-gradient(
 13 |     from 180deg at 50% 50%,
 14 |     #16abff33 0deg,
 15 |     #0885ff33 55deg,
 16 |     #54d6ff33 120deg,
 17 |     #0071ff33 160deg,
 18 |     transparent 360deg
 19 |   );
 20 |   --secondary-glow: radial-gradient(
 21 |     rgba(255, 255, 255, 1),
 22 |     rgba(255, 255, 255, 0)
 23 |   );
 24 | 
 25 |   --tile-start-rgb: 239, 245, 249;
 26 |   --tile-end-rgb: 228, 232, 233;
 27 |   --tile-border: conic-gradient(
 28 |     #00000080,
 29 |     #00000040,
 30 |     #00000030,
 31 |     #00000020,
 32 |     #00000010,
 33 |     #00000010,
 34 |     #00000080
 35 |   );
 36 | 
 37 |   --callout-rgb: 238, 240, 241;
 38 |   --callout-border-rgb: 172, 175, 176;
 39 |   --card-rgb: 180, 185, 188;
 40 |   --card-border-rgb: 131, 134, 135;
 41 | }
 42 | 
 43 | @media (prefers-color-scheme: dark) {
 44 |   :root {
 45 |     --foreground-rgb: 255, 255, 255;
 46 |     --background-start-rgb: 0, 0, 0;
 47 |     --background-end-rgb: 0, 0, 0;
 48 | 
 49 |     --primary-glow: radial-gradient(rgba(1, 65, 255, 0.4), rgba(1, 65, 255, 0));
 50 |     --secondary-glow: linear-gradient(
 51 |       to bottom right,
 52 |       rgba(1, 65, 255, 0),
 53 |       rgba(1, 65, 255, 0),
 54 |       rgba(1, 65, 255, 0.3)
 55 |     );
 56 | 
 57 |     --tile-start-rgb: 2, 13, 46;
 58 |     --tile-end-rgb: 2, 5, 19;
 59 |     --tile-border: conic-gradient(
 60 |       #ffffff80,
 61 |       #ffffff40,
 62 |       #ffffff30,
 63 |       #ffffff20,
 64 |       #ffffff10,
 65 |       #ffffff10,
 66 |       #ffffff80
 67 |     );
 68 | 
 69 |     --callout-rgb: 20, 20, 20;
 70 |     --callout-border-rgb: 108, 108, 108;
 71 |     --card-rgb: 100, 100, 100;
 72 |     --card-border-rgb: 200, 200, 200;
 73 |   }
 74 | }
 75 | 
 76 | * {
 77 |   box-sizing: border-box;
 78 |   padding: 0;
 79 |   margin: 0;
 80 | }
 81 | 
 82 | html,
 83 | body {
 84 |   max-width: 100vw;
 85 |   overflow-x: hidden;
 86 | }
 87 | 
 88 | body {
 89 |   color: rgb(var(--foreground-rgb));
 90 |   background: linear-gradient(
 91 |       to bottom,
 92 |       transparent,
 93 |       rgb(var(--background-end-rgb))
 94 |     )
 95 |     rgb(var(--background-start-rgb));
 96 | }
 97 | 
 98 | a {
 99 |   color: inherit;
100 |   text-decoration: none;
101 | }
102 | 
103 | @media (prefers-color-scheme: dark) {
104 |   html {
105 |     color-scheme: dark;
106 |   }
107 | }
108 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DataGridSingleTurn.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import React from "react";
 3 | import { DataGrid } from "@mui/x-data-grid";
 4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
 5 | 
 6 | const DataGridSingleTurn = ({ serverData }) => {
 7 |   const columns = [
 8 |     {
 9 |       field: "id",
10 |       headerName: "Id",
11 |       hide: true,
12 |     },
13 |     {
14 |       field: "name",
15 |       headerName: "Name",
16 |       width: "250",
17 |     },
18 |     {
19 |       field: "eval_type",
20 |       headerName: "Eval Type",
21 |     },
22 |     {
23 |       field: "model",
24 |       headerName: "Model",
25 |       width: "250",
26 |     },
27 |     {
28 |       field: "previous_question",
29 |       headerName: "Previous Question",
30 |       width: "400",
31 |     },
32 |     {
33 |       field: "previous_answer",
34 |       headerName: "Previous Answer",
35 |       width: "400",
36 |     },
37 |     {
38 |       field: "current_question",
39 |       headerName: "Current Question",
40 |       width: "400",
41 |     },
42 |     {
43 |       field: "golden_rewritten_question",
44 |       headerName: "Golden Rewritten Question",
45 |       width: "400",
46 |     },
47 |     {
48 |       field: "rewritten_question",
49 |       headerName: "Rewritten Question",
50 |       width: "400",
51 |     },
52 |     {
53 |       field: "Grade",
54 |       headerName: "Grade",
55 |       width: 100,
56 |     }
57 |   ];
58 | 
59 |   return (
60 |     <div style={{height: 350}}>
61 |       {" "}
62 |       <DataGrid
63 |         {...{
64 |           columns: columns,
65 |           rows: serverData.map((item) => {
66 |             return {
67 |               id: item._id,
68 |               name: item.name,
69 |               eval_type: item.eval_type,
70 |               model: item.content.query.model,
71 |               previous_question: item.content.query.previous_question,
72 |               previous_answer: item.content.query.previous_answer,
73 |               current_question: item.content.query.current_question,
74 |               golden_rewritten_question: item.content.query.golden_rewritten_question,
75 |               rewritten_question: item.content.query.rewritten_question,
76 |               Grade: item.content.result.Grade
77 |             };
78 |           }),
79 |         }}
80 |         density="compact"
81 |         getRowHeight={() => "auto"}
82 |         autoHeight={true}
83 |         initialState={{
84 |           ...{
85 |             columns: columns,
86 |             rows: [],
87 |           }.initialState,
88 |           pagination: { paginationModel: { pageSize: 10 } },
89 |         }}
90 |         pageSizeOptions={[5, 10, 25]}
91 |         slots={{ toolbar: DataGridToolbar }}
92 |       />
93 |     </div>
94 |   );
95 | };
96 | 
97 | export default DataGridSingleTurn;
98 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DataGridSingleTurnSummary.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import React from "react";
 3 | import { DataGrid } from "@mui/x-data-grid";
 4 | import DataGridToolbar from "@/components/globals/DataGridToolbar";
 5 | 
 6 | const DataGridSingleTurnSummary = ({ serverData }) => {
 7 |   const columns = [
 8 |     {
 9 |       field: "id",
10 |       headerName: "Id",
11 |       hide: true,
12 |     },
13 |     {
14 |       field: "name",
15 |       headerName: "Name",
16 |       width: "250",
17 |     },
18 |     {
19 |       field: "experiment_name",
20 |       headerName: "Experiment Name",
21 |       width: "250",
22 |     },
23 |     {
24 |       field: "eval_type",
25 |       headerName: "Eval Type",
26 |     },
27 |     
28 |     {
29 |       field: "previous_question",
30 |       headerName: "Previous Question",
31 |       width: "400",
32 |     },
33 |     {
34 |       field: "previous_answer",
35 |       headerName: "Previous Answer",
36 |       width: "400",
37 |     },
38 |     {
39 |       field: "current_question",
40 |       headerName: "Current Question",
41 |       width: "400",
42 |     },
43 |     {
44 |       field: "golden_rewritten_question",
45 |       headerName: "Golden Rewritten Question",
46 |       width: "400",
47 |     },
48 |     {
49 |       field: "rewritten_question",
50 |       headerName: "Rewritten Question",
51 |       width: "400",
52 |     },
53 |     {
54 |       field: "Grade",
55 |       headerName: "JudgeIt Score",
56 |       width: 100,
57 |     }
58 |   ];
59 | 
60 |   return (
61 |     <div style={{height: 350}}>
62 |       {" "}
63 |       <DataGrid
64 |         {...{
65 |           columns: columns,
66 |           rows: serverData.map((item) => {
67 |             return {
68 |               id: item._id,
69 |               name: item.name,
70 |               eval_type: item.eval_type,
71 |               experiment_name: item.experiment_name,
72 |               previous_question: item.previous_question,
73 |               previous_answer: item.previous_answer,
74 |               current_question: item.current_question,
75 |               golden_rewritten_question: item.golden_rewritten_question,
76 |               rewritten_question: item.rewritten_question,
77 |               Grade: (item?.Grade) ? item?.Grade : item?.judgeit_score
78 |             };
79 |           }),
80 |         }}
81 |         density="compact"
82 |         getRowHeight={() => "auto"}
83 |         autoHeight={true}
84 |         initialState={{
85 |           ...{
86 |             columns: columns,
87 |             rows: [],
88 |           }.initialState,
89 |           pagination: { paginationModel: { pageSize: 10 } },
90 |         }}
91 |         pageSizeOptions={[5, 10, 25]}
92 |         slots={{ toolbar: DataGridToolbar }}
93 |       />
94 |     </div>
95 |   );
96 | };
97 | 
98 | export default DataGridSingleTurnSummary;
99 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DisplayRequestHistoryMultiTurn.jsx:
--------------------------------------------------------------------------------
 1 | import React from "react";
 2 | import { Grid, Paper, Box, CircularProgress } from "@mui/material";
 3 | 
 4 | const DisplayRequestHistoryMultiTurnConversation = ({ serverData }) => {
 5 |   return (
 6 |     <>
 7 |       <Grid item xs={12} marginLeft={"25px"}>
 8 |         <Box
 9 |           elevation={2}
10 |           padding={"20px"}
11 |           border={"1px solid grey"}
12 |           borderRadius={"5px"}
13 |         >
14 |           <Grid spacing={2} sx={{ flexGrow: 1 }} container>
15 |             <Grid item xs={3} fontWeight={"bold"}>
16 |               Experiment name:
17 |             </Grid>
18 |             <Grid item xs={9}>
19 |               {serverData.experiment_name}
20 |             </Grid>
21 | 
22 |             <Grid item xs={3} fontWeight={"bold"}>
23 |               Request type:
24 |             </Grid>
25 |             <Grid item xs={9}>
26 |               {serverData.eval_type}
27 |             </Grid>
28 | 
29 |             <Grid item xs={3} fontWeight={"bold"}>
30 |               Conversation History:
31 |             </Grid>
32 |             <Grid item xs={9}>
33 |               {serverData.content.query.conversation_history}
34 |             </Grid>
35 | 
36 |             <Grid item xs={3} fontWeight={"bold"}>
37 |               Follow up query:
38 |             </Grid>
39 |             <Grid item xs={9}>
40 |               {serverData.content.query.follow_up_query}
41 |             </Grid>
42 |             <Grid item xs={3} fontWeight={"bold"}>
43 |               Golden query:
44 |             </Grid>
45 |             <Grid item xs={9}>
46 |               {serverData.content.query.golden_query}
47 |             </Grid>
48 |             <Grid item xs={3} fontWeight={"bold"}>
49 |               Rewritten query:
50 |             </Grid>
51 |             <Grid item xs={9}>
52 |               {serverData.content.query.rewritten_query}
53 |             </Grid>
54 |             <Grid item xs={3} fontWeight={"bold"}>
55 |               Model:
56 |             </Grid>
57 |             <Grid item xs={9}>
58 |               {serverData.content.query.model}
59 |             </Grid>
60 |           </Grid>
61 |         </Box>
62 |       </Grid>
63 | 
64 |       <Grid item xs={12} marginLeft={"25px"} marginTop={"20px"}>
65 |         <Box
66 |           elevation={2}
67 |           padding={"20px"}
68 |           border={"1px solid grey"}
69 |           borderRadius={"5px"}
70 |         >
71 |           <Grid spacing={2} sx={{ flexGrow: 1 }} container>
72 |             <Grid item xs={3} fontWeight={"bold"}>
73 |               Grade:
74 |             </Grid>
75 |             <Grid item xs={9}>
76 |               {serverData.content.result.Grade || serverData.content.result.judgeit_score}
77 |             </Grid>
78 |           </Grid>
79 |         </Box>
80 |       </Grid>
81 |     </>
82 |   );
83 | };
84 | 
85 | export default DisplayRequestHistoryMultiTurnConversation;
86 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/RatingSimilarityForm.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import React from "react";
 3 | import { TextField, Box, Tooltip } from "@mui/material";
 4 | import InfoOutlinedIcon from "@mui/icons-material/InfoOutlined";
 5 | 
 6 | const RatingSimilarityForm = ({
 7 |   values,
 8 |   handleChange,
 9 |   handleBlur,
10 |   errors,
11 |   touched,
12 | }) => {
13 |   return (
14 |     <div>
15 |       <Box margin={"20px"} display={"flex"} flexDirection={"row"}>
16 |         <TextField
17 |           label="Question"
18 |           name="question"
19 |           value={values.question}
20 |           onChange={handleChange}
21 |           onBlur={handleBlur}
22 |           error={touched.question && Boolean(errors.question)}
23 |           helperText={touched.question && errors.question}
24 |           style={{ width: "100%" }}
25 |         />
26 |         <Tooltip title="LLM Query." sx={{ marginLeft: "5px", cursor: "help" }}>
27 |           <InfoOutlinedIcon />
28 |         </Tooltip>
29 |       </Box>
30 |       <Box margin={"20px"} display={"flex"} flexDirection={"row"}>
31 |         <TextField
32 |           label="Golden Text"
33 |           name="golden_text"
34 |           value={values.golden_text}
35 |           onChange={handleChange}
36 |           onBlur={handleBlur}
37 |           error={touched.golden_text && Boolean(errors.golden_text)}
38 |           helperText={touched.golden_text && errors.golden_text}
39 |           style={{ width: "100%" }}
40 |           multiline
41 |           rows={"4"}
42 |         />
43 |         <Tooltip
44 |           title="The reference or target text provided as the correct answer or expected output. Used to evaluate the quality and accuracy of generated text against an ideal standard."
45 |           sx={{ marginLeft: "5px", cursor: "help" }}
46 |         >
47 |           <InfoOutlinedIcon />
48 |         </Tooltip>
49 |       </Box>
50 |       <Box
51 |         marginBottom={"20px"}
52 |         marginLeft={"20px"}
53 |         marginRight={"20px"}
54 |         display={"flex"}
55 |         flexDirection={"row"}
56 |       >
57 |         <TextField
58 |           label="LLM Response"
59 |           name="generated_text"
60 |           value={values.generated_text}
61 |           onChange={handleChange}
62 |           onBlur={handleBlur}
63 |           error={touched.generated_text && Boolean(errors.generated_text)}
64 |           helperText={touched.generated_text && errors.generated_text}
65 |           style={{ width: "100%" }}
66 |           multiline
67 |           rows={"4"}
68 |         />
69 |         <Tooltip
70 |           title="The output produced by the language model. Compared against the golden text to assess coherence, correctness, and alignment with the expected result."
71 |           sx={{ marginLeft: "5px", cursor: "help" }}
72 |         >
73 |           <InfoOutlinedIcon />
74 |         </Tooltip>
75 |       </Box>
76 |     </div>
77 |   );
78 | };
79 | 
80 | export default RatingSimilarityForm;
81 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/services/WatsonXService.py:
--------------------------------------------------------------------------------
 1 | from ibm_watson_machine_learning.foundation_models import Model
 2 | from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
 3 | 
 4 | from ibm_watsonx_ai.foundation_models import Model
 5 | from langchain_ibm import WatsonxLLM
 6 | from langchain_core.prompts import PromptTemplate
 7 | 
 8 | class WatsonXService:
 9 | 
10 |     def __init__(self,
11 |                  api_key, 
12 |                  project_id, 
13 |                  llm_model_id) -> None:
14 |         self.api_key        = api_key  
15 |         self.ibm_cloud_url  = 'https://us-south.ml.cloud.ibm.com'
16 |         self.project_id     = project_id
17 |         self.llm_model_id   = llm_model_id
18 | 
19 |     def get_wml_llm_services(self,
20 |             decoding_method="greedy",
21 |             min_new_tokens=1,
22 |             max_new_tokens=200,
23 |             repetition_penalty=1,
24 |             stop_sequences=['}']) -> WatsonxLLM:
25 |         
26 |          # llm parameters
27 |         generate_parameters = {
28 |             "decoding_method": decoding_method,
29 |             "min_new_tokens": min_new_tokens,
30 |             "max_new_tokens": max_new_tokens,
31 |             "repetition_penalty": repetition_penalty,
32 |             "stop_sequences": stop_sequences
33 |         }
34 | 
35 |         # instatiate llm
36 |         llm_model = WatsonxLLM(apikey=self.api_key,
37 |                             url=self.ibm_cloud_url,
38 |                             project_id=self.project_id,
39 |                             model_id=self.llm_model_id,
40 |                             params=generate_parameters)
41 |         return llm_model
42 | 
43 |     ## using watsonx machine learning api
44 |     def send_to_watsonxai(
45 |                         self,
46 |                         prompts,
47 |                         model_id="meta-llama/llama-3-70b-instruct",
48 |                         decoding_method="greedy",
49 |                         max_new_tokens=500,
50 |                         min_new_tokens=30,
51 |                         temperature=1.0,
52 |                         repetition_penalty=1.0
53 |                         ):
54 |         
55 |         # Instantiate parameters for text generation
56 |         model_params = {
57 |             GenParams.DECODING_METHOD: decoding_method,
58 |             GenParams.MIN_NEW_TOKENS: min_new_tokens,
59 |             GenParams.MAX_NEW_TOKENS: max_new_tokens,
60 |             GenParams.RANDOM_SEED: 42,
61 |             GenParams.TEMPERATURE: temperature,
62 |             GenParams.REPETITION_PENALTY: repetition_penalty,
63 |         }
64 | 
65 |         model = Model(
66 |             model_id=model_id,
67 |             params=model_params,
68 |             credentials={
69 |             "url" : self.ibm_cloud_url,
70 |             "apikey" : self.api_key
71 |                 },
72 |             project_id=self.project_id)
73 | 
74 |         response=model.generate_text(prompts)
75 |         return response
76 |     


--------------------------------------------------------------------------------
/REST-Service/deployment/readme.md:
--------------------------------------------------------------------------------
  1 | # Deploy REST Service in OpenShift cluster
  2 | 
  3 | ## Login to OpenShift cluster
  4 | 
  5 | Step 1: Login to openshift console and copy login command
  6 | 
  7 | <img width="652" alt="image" src="https://github.com/user-attachments/assets/8c8a174e-4a9f-4c82-abdf-c9adc3f6e410">
  8 | 
  9 | Login with the token or user user and password in the command line
 10 | 
 11 | ## Deployment steps
 12 | 
 13 | - Create a new project
 14 | 
 15 | ```sh
 16 | oc new-project llm-judge
 17 | ```
 18 | 
 19 | - Set the project name in a variable
 20 | 
 21 | ```sh
 22 | export $NAMESPACE_NAME='llm-judge'
 23 | ```
 24 | 
 25 | - We are using the OpenShift internal registry; however, you can use any container registry.
 26 | 
 27 | ```sh
 28 | export REGISTRY=$(oc get routes -n openshift-image-registry -o jsonpath='{.items[0].spec.host}')
 29 | echo $(oc whoami -t) | docker login $REGISTRY -u $(oc whoami) --password-stdin
 30 | ```
 31 | 
 32 | - Build the docker image and push it to internal registry
 33 | 
 34 | ```sh
 35 | docker build -t $REGISTRY/$NAMESPACE_NAME/backend:v1.0 .
 36 | docker push $REGISTRY/$NAMESPACE_NAME/backend:v1.0
 37 | ```
 38 | 
 39 | - We have a deployment directory with kustomization. Before you applying the deployment please edit [base/kustomize.yaml](base/kustomization.yaml) file and update the below variables based on the values you have.
 40 | 
 41 |     - WATSONX_URL=
 42 |     - WX_PROJECT_ID=
 43 |     - IBM_CLOUD_API_KEY=
 44 |     - LLM_JUDGE_API_KEY=JudgeIt-Secret-Api-Key
 45 |     - WX_PLATFORM=saas
 46 |     - WX_USER=
 47 |     - CELERY_BROKER_URL=redis://redis:6379/0
 48 |     - CELERY_RESULT_BACKEND=redis://redis:6379/0
 49 |     - SERVER_URL=
 50 |     - MONGO_URL=
 51 |     - MONGO_USER=
 52 |     - MONGO_PASS=
 53 |     - MONGO_DB="judgeit_app"
 54 | 
 55 | ```yaml
 56 | kind: Kustomization
 57 | images:
 58 |   - name: backend-image-name
 59 |     newName: image-registry.openshift-image-registry.svc:5000/llm-judge-dev/backend
 60 |     newTag: v1.0
 61 | secretGenerator:
 62 | - name: llm-judge-secret
 63 |   literals:
 64 |     - WATSONX_URL=
 65 |     - WX_PROJECT_ID=
 66 |     - IBM_CLOUD_API_KEY=
 67 |     - LLM_JUDGE_API_KEY=JudgeIt-Secret-Api-Key
 68 |     - WX_PLATFORM=saas
 69 |     - WX_USER=
 70 |     - CELERY_BROKER_URL=redis://redis:6379/0
 71 |     - CELERY_RESULT_BACKEND=redis://redis:6379/0
 72 |     - SERVER_URL=
 73 |     - MONGO_URL=
 74 |     - MONGO_USER=
 75 |     - MONGO_PASS=
 76 |     - MONGO_DB="judgeit_app"
 77 | resources:
 78 |   - redis/
 79 |   - celery-worker/
 80 |   - flower/
 81 |   - rest-app/
 82 | ```
 83 | 
 84 | - Apply the deployment
 85 | 
 86 | ```sh
 87 | oc apply -k base/
 88 | ```
 89 | 
 90 | - Monitor the deployment
 91 | 
 92 | ```sh
 93 | watch oc get deployments,pods
 94 | ```
 95 | 
 96 | - Test
 97 | 
 98 | Copy the url from the command executed below and paste it in the browser.
 99 | 
100 | ```sh
101 | oc get routes/llm-judge-backend -o jsonpath='https://{.spec.host}/docs{"\n"}'
102 | ```
103 | 
104 | - Clean up
105 | 
106 | ```sh
107 | oc delete -k base/
108 | ```
109 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/services/answer_similarity.py:
--------------------------------------------------------------------------------
 1 | from langchain_core.prompts import PromptTemplate
 2 | 
 3 | ## Grading a generated text compared to a golden text
 4 | SIMILARITY_PROMPT= """Follow these structured steps to accurately assess the similarity between a Golden Text and a Generated Text:
 5 | 1. **Role and Task**: Assume the role of an impartial assistant and evaluator. Your task is to assess the similarity between a Golden Text and a Generated Text using the provided information.
 6 | 2. **Initial Setup**: Begin by carefully reviewing the Golden Text to understand the key information, entities, and intents it contains. The Golden Text is considered fully correct and comprehensive. Then, examine the Generated Text that needs evaluation.
 7 | 3. **Evaluation Criteria**: Evaluate the Generated Text based on the following criteria:
 8 |     - Output {{"Grade": "1"}} if:
 9 |       a) The Generated Text matches the Golden Text closely in terms of key entities and intents. Note that these may be worded differently but convey the same meaning.
10 |       b) The Generated Text contains all the essential information from the Golden Text, even if presented in a different order or with slight variations in phrasing.
11 |       c) The Generated Text includes the core information from the Golden Text and may contain additional relevant details or expansions that don't contradict the original.
12 |     - Output {{"Grade": "0"}} if:
13 |       a) The Generated Text is missing critical entities or intents that are present in the Golden Text.
14 |       b) The Generated Text contains significant factual errors or contradictions when compared to the Golden Text.
15 |       c) The overall meaning or intent of the Generated Text substantially differs from the Golden Text.
16 | 4. **Tolerance for Minor Differences**: Allow for minor differences in numerical values, slight variations in proper nouns, and small discrepancies in less critical details, as long as the core meaning and primary facts remain intact.
17 | 5. **Explanation**: After providing the grade, explain your reasoning in 1 sentence, highlighting key similarities or differences that influenced your decision.
18 | 6. **Output Format**: Format your evaluation output strictly as {{"Grade": "evaluated grade", "Explanation": "explanation for grade"}} to ensure clarity and consistency in assessment.
19 | Remember, the goal is to identify substantive similarity rather than expecting word-for-word matches. Focus on the core information, key facts, and overall intent when making your assessment.
20 | 
21 | Input:
22 | Golden Text: {prompt_parameter_1}
23 | Generated Text: {prompt_parameter_2}
24 | 
25 | Output:
26 | """
27 | 
28 | def build_query_similarity_prompt(row):
29 |     input_variables = ['prompt_parameter_1', 'prompt_parameter_2']
30 |     prompt = PromptTemplate(input_variables=input_variables, template=SIMILARITY_PROMPT)
31 |     # create invoke parameter which is a dictionary of your prompt parameters
32 |     prompt_data = {'prompt_parameter_1': row['golden_text'],
33 |                     'prompt_parameter_2': row['generated_text']}
34 |     
35 |     return prompt, prompt_data


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/BarChart.jsx:
--------------------------------------------------------------------------------
  1 | import React from 'react';
  2 | import { Bar } from 'react-chartjs-2';
  3 | import { Chart as ChartJS, CategoryScale, LinearScale, BarElement, Title, Tooltip, Legend } from 'chart.js';
  4 | import { API_TYPE_RATING, API_TYPE_SIMILARITY, API_TYPE_MULTITURN } from "@/services/Config";
  5 | import ChartDataLabels from 'chartjs-plugin-datalabels';
  6 | 
  7 | ChartJS.register(CategoryScale, LinearScale, BarElement, Title, Tooltip, Legend, ChartDataLabels);
  8 | 
  9 | const BarChart = ({ gradeData, gradeType }) => {
 10 |   const totalCount = Object.values(gradeData).reduce((sum, count) => sum + count, 0);
 11 | 
 12 |   const mapGradeLabels = (label) => {
 13 |     const labelMaps = {
 14 |       [API_TYPE_RATING]: {
 15 |         '1': 'Incorrect',
 16 |         '2': 'Partially Correct',
 17 |         '3': 'Correct'
 18 |       },
 19 |       [API_TYPE_SIMILARITY]: {
 20 |         '0': 'Incorrect',
 21 |         '1': 'Correct'
 22 |       },
 23 |       [API_TYPE_MULTITURN]: {
 24 |         '0': 'Incorrect',
 25 |         '1': 'Correct'
 26 |       }
 27 |     };
 28 | 
 29 |     return labelMaps[gradeType]?.[label] || label;
 30 |   };
 31 | 
 32 |   const data = {
 33 |     labels: Object.keys(gradeData).map(mapGradeLabels),
 34 |     datasets: [
 35 |       {
 36 |         label: 'Count',
 37 |         data: Object.values(gradeData),
 38 |         backgroundColor: 'rgba(144, 202, 249, 0.6)',
 39 |         borderColor: 'rgba(144, 202, 249, 1)',
 40 |         borderWidth: 1,
 41 |       },
 42 |     ],
 43 |   };
 44 | 
 45 |   const options = {
 46 |     responsive: true,
 47 |     maintainAspectRatio: false,
 48 |     scales: {
 49 |       x: {
 50 |         title: {
 51 |           display: true,
 52 |           text: 'JudgeIt Score',
 53 |           font: {
 54 |             size: 14,
 55 |             weight: 'bold',
 56 |           },
 57 |         },
 58 |       },
 59 |       y: {
 60 |         title: {
 61 |           display: true,
 62 |           text: 'Count',
 63 |           font: {
 64 |             size: 14,
 65 |             weight: 'bold',
 66 |           },
 67 |         },
 68 |         beginAtZero: true,
 69 |       },
 70 |     },
 71 |     plugins: {
 72 |       tooltip: {
 73 |         callbacks: {
 74 |           label: (context) => {
 75 |             const count = context.raw;
 76 |             const percentage = ((count / totalCount) * 100).toFixed(2);
 77 |             return `Count: ${count} (${percentage}%)`;
 78 |           },
 79 |         },
 80 |       },
 81 |       datalabels: {
 82 |         color: 'black',          // Label color
 83 |         anchor: 'end',           // Positioning of the label
 84 |         align: 'top',            // Align the label at the top
 85 |         font: {
 86 |           weight: 'bold',
 87 |           size: 12,
 88 |         },
 89 |         formatter: (value) => "Count: " + value,  // Format the value as you want
 90 |       },
 91 |     },
 92 |   };
 93 | 
 94 |   return (
 95 |     <div style={{ width: '100%', height: '400px' }}>
 96 |       <Bar data={data} options={options} />
 97 |     </div>
 98 |   );
 99 | };
100 | 
101 | export default BarChart;


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DisplayRequestHistorySingleTurn.jsx:
--------------------------------------------------------------------------------
 1 | import React from "react";
 2 | import { Grid, Paper, Box, CircularProgress } from "@mui/material";
 3 | 
 4 | const DisplayRequestHistorySingleTurn = ({ serverData }) => {
 5 |   return (
 6 |     <>
 7 |       <Grid item xs={12} marginLeft={"25px"}>
 8 |         <Box
 9 |           elevation={2}
10 |           padding={"20px"}
11 |           border={"1px solid grey"}
12 |           borderRadius={"5px"}
13 |         >
14 |           <Grid spacing={2} sx={{ flexGrow: 1 }} container>
15 |             <Grid item xs={3} fontWeight={"bold"}>
16 |               Experiment name:
17 |             </Grid>
18 |             <Grid item xs={9}>
19 |               {serverData.experiment_name}
20 |             </Grid>
21 | 
22 |             <Grid item xs={3} fontWeight={"bold"}>
23 |               Request type:
24 |             </Grid>
25 |             <Grid item xs={9}>
26 |               {serverData.eval_type}
27 |             </Grid>
28 | 
29 |             <Grid item xs={3} fontWeight={"bold"}>
30 |               Previous question:
31 |             </Grid>
32 |             <Grid item xs={9}>
33 |               {serverData.content.query.previous_question}
34 |             </Grid>
35 | 
36 |             <Grid item xs={3} fontWeight={"bold"}>
37 |               Previous answer:
38 |             </Grid>
39 |             <Grid item xs={9}>
40 |               {serverData.content.query.previous_answer}
41 |             </Grid>
42 |             <Grid item xs={3} fontWeight={"bold"}>
43 |               Current question:
44 |             </Grid>
45 |             <Grid item xs={9}>
46 |               {serverData.content.query.current_question}
47 |             </Grid>
48 |             <Grid item xs={3} fontWeight={"bold"}>
49 |               Golden rewritten question:
50 |             </Grid>
51 |             <Grid item xs={9}>
52 |               {serverData.content.query.golden_rewritten_question}
53 |             </Grid>
54 |             <Grid item xs={3} fontWeight={"bold"}>
55 |               Rewritten question:
56 |             </Grid>
57 |             <Grid item xs={9}>
58 |               {serverData.content.query.rewritten_question}
59 |             </Grid>
60 |             <Grid item xs={3} fontWeight={"bold"}>
61 |               Model:
62 |             </Grid>
63 |             <Grid item xs={9}>
64 |               {serverData.content.query.model}
65 |             </Grid>
66 |           </Grid>
67 |         </Box>
68 |       </Grid>
69 | 
70 |       <Grid item xs={12} marginLeft={"25px"} marginTop={"20px"}>
71 |         <Box
72 |           elevation={2}
73 |           padding={"20px"}
74 |           border={"1px solid grey"}
75 |           borderRadius={"5px"}
76 |         >
77 |           <Grid spacing={2} sx={{ flexGrow: 1 }} container>
78 |             <Grid item xs={3} fontWeight={"bold"}>
79 |               Grade:
80 |             </Grid>
81 |             <Grid item xs={9}>
82 |               {serverData.content.result.Grade || serverData.content.result.judgeit_score}
83 |             </Grid>
84 |           </Grid>
85 |         </Box>
86 |       </Grid>
87 |     </>
88 |   );
89 | };
90 | 
91 | export default DisplayRequestHistorySingleTurn;
92 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/SingleTurnForm.jsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | import React from "react";
 3 | import { TextField, Box } from "@mui/material";
 4 | 
 5 | const SingleTurnForm = ({
 6 |   values,
 7 |   handleChange,
 8 |   handleBlur,
 9 |   errors,
10 |   touched,
11 | }) => {
12 |   return (
13 |     <div>
14 |       <Box marginBottom={"20px"} margin={"20px"}>
15 |         <TextField
16 |           label="Previous Question"
17 |           name="previous_question"
18 |           value={values.previous_question}
19 |           onChange={handleChange}
20 |           onBlur={handleBlur}
21 |           error={touched.previous_question && Boolean(errors.previous_question)}
22 |           helperText={touched.previous_question && errors.previous_question}
23 |           style={{ width: "100%" }}
24 |         />
25 |       </Box>
26 |       <Box marginBottom={"20px"} marginLeft={"20px"} marginRight={'20px'}>
27 |         <TextField
28 |           label="Previous Answer "
29 |           name="previous_answer"
30 |           value={values.previous_answer}
31 |           onChange={handleChange}
32 |           onBlur={handleBlur}
33 |           error={touched.previous_answer && Boolean(errors.previous_answer)}
34 |           helperText={touched.previous_answer && errors.previous_answer}
35 |           style={{ width: "100%" }}
36 |           multiline
37 |           rows={"4"}
38 |         />
39 |       </Box>
40 |       <Box marginBottom={"20px"} marginLeft={"20px"} marginRight={'20px'}>
41 |         <TextField
42 |           label="Current Question"
43 |           name="current_question"
44 |           value={values.current_question}
45 |           onChange={handleChange}
46 |           onBlur={handleBlur}
47 |           error={touched.current_question && Boolean(errors.current_question)}
48 |           helperText={touched.current_question && errors.current_question}
49 |           style={{ width: "100%" }}
50 |         />
51 |       </Box>
52 |       <Box marginBottom={"20px"} marginLeft={"20px"} marginRight={'20px'}>
53 |         <TextField
54 |           label="Golden Rewritten Question "
55 |           name="golden_rewritten_question"
56 |           value={values.golden_rewritten_question}
57 |           onChange={handleChange}
58 |           onBlur={handleBlur}
59 |           error={touched.golden_rewritten_question && Boolean(errors.golden_rewritten_question)}
60 |           helperText={touched.golden_rewritten_question && errors.golden_rewritten_question}
61 |           style={{ width: "100%" }}
62 |           multiline
63 |           rows={"4"}
64 |         />
65 |       </Box>
66 |       <Box marginBottom={"20px"} marginLeft={"20px"} marginRight={'20px'}>
67 |         <TextField
68 |           label="Rewritten Question"
69 |           name="rewritten_question"
70 |           value={values.rewritten_question}
71 |           onChange={handleChange}
72 |           onBlur={handleBlur}
73 |           error={touched.rewritten_question && Boolean(errors.rewritten_question)}
74 |           helperText={touched.rewritten_question && errors.rewritten_question}
75 |           style={{ width: "100%" }}
76 |           multiline
77 |           rows={"4"}
78 |         />
79 |       </Box>
80 |     </div>
81 |   );
82 | };
83 | 
84 | export default SingleTurnForm;
85 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/DisplayRequestHistoryRatingSimilarity.jsx:
--------------------------------------------------------------------------------
 1 | import React from "react";
 2 | import { Grid, Box } from "@mui/material";
 3 | import {
 4 |   API_TYPE_RATING,
 5 |   API_TYPE_SIMILARITY,
 6 |   grade_map_rating,
 7 |   grade_map_similarity,
 8 | } from "@/services/Config";
 9 | 
10 | const DisplayRequestHistoryRatingSimilarity = ({ serverData }) => {
11 |   return (
12 |     <>
13 |       <Grid item xs={12} marginLeft={"25px"}>
14 |         <Box
15 |           elevation={2}
16 |           padding={"20px"}
17 |           border={"1px solid grey"}
18 |           borderRadius={"5px"}
19 |         >
20 |           <Grid spacing={2} sx={{ flexGrow: 1 }} container>
21 |             <Grid item xs={3} fontWeight={"bold"}>
22 |               Experiment name:
23 |             </Grid>
24 |             <Grid item xs={9}>
25 |               {serverData.experiment_name}
26 |             </Grid>
27 | 
28 |             <Grid item xs={3} fontWeight={"bold"}>
29 |               Request type:
30 |             </Grid>
31 |             <Grid item xs={9}>
32 |               {serverData.eval_type}
33 |             </Grid>
34 |             <Grid item xs={3} fontWeight={"bold"}>
35 |               Question:
36 |             </Grid>
37 |             <Grid item xs={9}>
38 |               {serverData.content.query.question}
39 |             </Grid>
40 |             <Grid item xs={3} fontWeight={"bold"}>
41 |               Golden Text:
42 |             </Grid>
43 |             <Grid item xs={9}>
44 |               {serverData.content.query.golden_text}
45 |             </Grid>
46 | 
47 |             <Grid item xs={3} fontWeight={"bold"}>
48 |               LLM Response:
49 |             </Grid>
50 |             <Grid item xs={9}>
51 |               {serverData.content.query.generated_text}
52 |             </Grid>
53 |             <Grid item xs={3} fontWeight={"bold"}>
54 |               Model:
55 |             </Grid>
56 |             <Grid item xs={9}>
57 |               {serverData.content.query.model}
58 |             </Grid>
59 |           </Grid>
60 |         </Box>
61 |       </Grid>
62 | 
63 |       <Grid item xs={12} marginLeft={"25px"} marginTop={"20px"}>
64 |         <Box
65 |           elevation={2}
66 |           padding={"20px"}
67 |           border={"1px solid grey"}
68 |           borderRadius={"5px"}
69 |         >
70 |           <Grid spacing={2} sx={{ flexGrow: 1 }} container>
71 |             <Grid item xs={3} fontWeight={"bold"}>
72 |               JudgeIt Score:
73 |             </Grid>
74 |             {API_TYPE_RATING === serverData.eval_type && (
75 |               <Grid item xs={9}>
76 |                 {grade_map_rating[serverData.content.result.Grade]}
77 |               </Grid>
78 |             )}
79 |             {API_TYPE_SIMILARITY === serverData.eval_type && (
80 |               <Grid item xs={9}>
81 |                 {grade_map_similarity[serverData.content.result.Grade]}
82 |               </Grid>
83 |             )}
84 | 
85 |             <Grid item xs={3} fontWeight={"bold"}>
86 |               JudgeIt Reasoning:
87 |             </Grid>
88 |             <Grid item xs={9}>
89 |               {serverData.content.result.Explanation}
90 |             </Grid>
91 |           </Grid>
92 |         </Box>
93 |       </Grid>
94 |     </>
95 |   );
96 | };
97 | 
98 | export default DisplayRequestHistoryRatingSimilarity;
99 | 


--------------------------------------------------------------------------------
/Framework/main.py:
--------------------------------------------------------------------------------
 1 | from answer_similarity import batch_llm_answer_similarity
 2 | from answer_rating import batch_llm_answer_rating
 3 | from multi_turn_eval import batch_llm_multi_turn_eval
 4 | 
 5 | import pandas as pd
 6 | import json
 7 | import configparser
 8 | 
 9 | import chardet 
10 | 
11 | config = configparser.ConfigParser()
12 | config.read('./config.ini')
13 | 
14 | ## Setup the filename and values
15 | home_dir = config['Default']['home_dir']
16 | input_file_name = config['Default']['input_file_name']
17 | output_file_name = config['Default']['output_file_name']
18 | model_id = config['Default']['model_id']
19 | judge_type = config['Default']['judge_type']
20 | 
21 | input_file = home_dir + input_file_name
22 | output_file = home_dir + output_file_name
23 | 
24 | def read_data(input_file):
25 |     ## Read the data for batch processing
26 |     data_df = pd.DataFrame()
27 |     if '.xlsx' in input_file:
28 |         data_df = pd.read_excel(input_file)
29 |     elif '.csv' in input_file:
30 |         with open(input_file, 'rb') as f:
31 |             result = chardet.detect(f.read())
32 |         data_df = pd.read_csv(input_file, encoding=result['encoding'])
33 |     return data_df
34 | 
35 | def write_data(data_df):
36 |     ## save the output
37 |     if '.xlsx' in output_file:
38 |         # write the dataframe to an excel file
39 |         writer = pd.ExcelWriter(output_file, engine='xlsxwriter')
40 |         data_df.to_excel(writer, index=False, sheet_name='Sheet1')
41 |         workbook = writer.book
42 |         worksheet = writer.sheets['Sheet1']
43 |         cell_format = workbook.add_format({'text_wrap': True, 'valign': 'top', 'align': 'left'})
44 |         for i, column in enumerate(data_df.columns):
45 |             worksheet.set_column(i, i, 40, cell_format)
46 |         worksheet.set_column(3, 3, 70, cell_format)
47 |         writer.close()
48 |     elif '.csv' in output_file:
49 |         data_df.to_csv(output_file)
50 |     print("File saved in /JudgeIt-LLM-as-a-Judge/Framework/data/output")
51 | 
52 | 
53 | def batch_llm_multi_turn_eval_caller(input_file):
54 |     input_data = read_data(input_file)
55 |     output_data = batch_llm_multi_turn_eval(model_id, input_data)
56 |     write_data(output_data)
57 |     return output_data
58 | 
59 | def batch_llm_answer_similarity_caller(input_file):
60 |     input_data = read_data(input_file)
61 |     output_data = batch_llm_answer_similarity(model_id, input_data)
62 |     write_data(output_data)
63 |     return output_data
64 | 
65 | def batch_llm_answer_rating_caller(input_file):
66 |     input_data = read_data(input_file)
67 |     output_data = batch_llm_answer_rating(model_id, input_data)
68 |     write_data(output_data)
69 |     return output_data
70 | 
71 | def processing(judge_type):
72 |     if judge_type == 'multi_turn_eval':
73 |         batch_llm_multi_turn_eval_caller(input_file)
74 |     elif judge_type == 'rag_eval_answer_similarity':
75 |         batch_llm_answer_similarity_caller(input_file)
76 |     elif judge_type == 'rag_eval_answer_rating':
77 |         batch_llm_answer_rating_caller(input_file)
78 |     
79 | 
80 | 
81 | 
82 | processing(judge_type)
83 | ## all options basis of tabs
84 | #processing('rating','batch')
85 | # processing('rating','simple')
86 | #processing('similarity','batch')
87 | #processing('similarity','simple')
88 | #processing('multi_turn')


--------------------------------------------------------------------------------
/REST-Service/app/src/services/LLMJudgeService.py:
--------------------------------------------------------------------------------
 1 | from langchain_ibm import WatsonxLLM
 2 | from app.src.services.answer_similarity import build_query_similarity_prompt
 3 | from app.src.services.answer_rating import build_query_rating_prompt
 4 | import json
 5 | from app.src.services.single_turn_eval import build_single_turn_prompt
 6 | from app.src.services.mult_turn_with_conversation_eval import build_multi_turn_prompt
 7 | 
 8 | class LLMJudgeService:
 9 | 
10 |     def __init__(self) -> None:
11 |         pass
12 | 
13 |     def simple_processing_rating(self, golden_text: str, generated_text:str, llm_model: WatsonxLLM):
14 |         
15 |         prompt, prompt_data = build_query_rating_prompt(row={
16 |             "golden_text": golden_text,
17 |             "generated_text": generated_text
18 |         })
19 | 
20 |         llm_chain = prompt | llm_model
21 |         prompt_results = llm_chain.invoke(prompt_data)
22 |         return json.loads(prompt_results)
23 |     
24 |     def simple_processing_similarity_answer(self, golden_text: str, generated_text:str, llm_model: WatsonxLLM):
25 | 
26 |         prompt, prompt_data = build_query_similarity_prompt(row={
27 |             "golden_text": golden_text,
28 |             "generated_text": generated_text
29 |         })
30 | 
31 |         llm_chain = prompt | llm_model
32 | 
33 |         prompt_results = llm_chain.invoke(prompt_data)
34 |         prompt_results = prompt_results.replace("\"1\" or \"0\"", "\"0\"")
35 |         return json.loads(prompt_results)
36 |     
37 |     def single_trun_llm_judge(self,
38 |         previous_question: str,
39 |         previous_answer: str,
40 |         current_question: str,
41 |         golden_rewritten_question: str,
42 |         rewritten_question: str,
43 |         llm_model: WatsonxLLM):
44 | 
45 |         prompt, prompt_data = build_single_turn_prompt(row={
46 |             "previous_question": previous_question,
47 |             "previous_answer": previous_answer,
48 |             "current_question": current_question,
49 |             "golden_rewritten_question": golden_rewritten_question,
50 |             "rewritten_question": rewritten_question
51 |         })
52 |         llm_chain = prompt | llm_model
53 |         prompt_results = {"Grade": None}
54 |         try:
55 |             prompt_results = json.loads(llm_chain.invoke(prompt_data))
56 |         except:
57 |             prompt_results = prompt_results = {
58 |                 "Grade": "Error"
59 |             }
60 | 
61 |         return prompt_results
62 |     
63 |     def multi_trun_llm_judge(self,
64 |         conversation_history: str,
65 |         follow_up_query: str,
66 |         golden_query: str,
67 |         rewritten_query: str,
68 |         llm_model: WatsonxLLM):
69 | 
70 |         prompt, prompt_data = build_multi_turn_prompt(row={
71 |             "conversation_history": conversation_history,
72 |             "follow_up_query": follow_up_query,
73 |             "golden_query": golden_query,
74 |             "rewritten_query": rewritten_query
75 |         })
76 |         llm_chain = prompt | llm_model
77 |         prompt_results = {"Grade": None}
78 |         try:
79 |             prompt_results = json.loads(llm_chain.invoke(prompt_data))
80 |         except:
81 |             prompt_results = prompt_results = {
82 |                 "Grade": "Error"
83 |             }
84 | 
85 |         return prompt_results
86 | 
87 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/services/answer_rating.py:
--------------------------------------------------------------------------------
 1 | from langchain_core.prompts import PromptTemplate
 2 | 
 3 | ## Grading a generated text compared to a golden text
 4 | RATING_PROMPT = """Follow these structured steps to accurately assess the similarity between a Golden Text and a Generated Text:
 5 | 1. **Role and Task**: Assume the role of an impartial assistant and evaluator. Your task is to assess the similarity between a Golden Text and a Generated Text using the provided information.
 6 | 2. **Initial Setup**: Begin by carefully reviewing the Golden Text to understand the key information, entities, and intents it contains. The Golden Text is considered fully correct and comprehensive. Then, examine the Generated Text that needs evaluation.
 7 | 3. **Evaluation Criteria**: Evaluate the Generated Text based on the following criteria:
 8 |     - Output {{"Grade": "1"}} if:
 9 |       a) The Generated Text is missing critical entities or intents that are present in the Golden Text.
10 |       b) The Generated Text contains significant factual errors or contradictions when compared to the Golden Text.
11 |       c) The overall meaning or intent of the Generated Text substantially differs from the Golden Text.
12 |     - Output {{"Grade": "2"}} if:
13 |       a) The Generated Text somewhat matches the Golden Text in terms of key entities and intents. Note that these may be worded differently but convey the same meaning.
14 |       b) The Generated Text contains part of the essential information from the Golden Text, even if presented in a different order or with slight variations in phrasing.
15 |       c) The Generated Text includes part the core information from the Golden Text and may contain additional relevant details or expansions that don't contradict the original.
16 |     - Output {{"Grade": "3"}} if:
17 |       a) The Generated Text matches the Golden Text closely in terms of key entities and intents. Note that these may be worded differently but convey the same meaning.
18 |       b) The Generated Text contains all the essential information from the Golden Text, even if presented in a different order or with slight variations in phrasing.
19 |       c) The Generated Text includes the core information from the Golden Text and may contain additional relevant details or expansions that don't contradict the original.
20 | 4. **Tolerance for Minor Differences**: Allow for minor differences in numerical values, slight variations in proper nouns, and small discrepancies in less critical details, as long as the core meaning and primary facts remain intact.
21 | 5. **Explanation**: After providing the grade, explain your reasoning in 1 sentence, highlighting key similarities or differences that influenced your decision.
22 | 6. **Output Format**: Format your evaluation output strictly as {{"Grade": "evaluated grade", "Explanation": "explanation for grade"}} to ensure clarity and consistency in assessment.
23 | Remember, the goal is to identify substantive similarity rather than expecting word-for-word matches. Focus on the core information, key facts, and overall intent when making your assessment.
24 | 
25 | Input:
26 | Golden Text: {prompt_parameter_1}
27 | Generated Text: {prompt_parameter_2}
28 | 
29 | Output:
30 | """
31 | 
32 | def build_query_rating_prompt(row):
33 |     input_variables = ['prompt_parameter_1', 'prompt_parameter_2']
34 |     prompt = PromptTemplate(input_variables=input_variables, template=RATING_PROMPT)
35 |     # create invoke parameter which is a dictionary of your prompt parameters
36 |     prompt_data = {'prompt_parameter_1': row['golden_text'],
37 |                     'prompt_parameter_2': row['generated_text']}
38 |     
39 |     return prompt, prompt_data


--------------------------------------------------------------------------------
/REST-Service/chart/templates/deployment.yaml:
--------------------------------------------------------------------------------
  1 | apiVersion: apps/v1
  2 | kind: Deployment
  3 | metadata:
  4 |   name: fastapi-app
  5 |   labels:
  6 |     app: fastapi-app
  7 | spec:
  8 |   replicas: {{ .Values.replicaCount }}
  9 |   selector:
 10 |     matchLabels:
 11 |       app: fastapi-app
 12 |   template:
 13 |     metadata:
 14 |       labels:
 15 |         app: fastapi-app
 16 |     spec:
 17 |       containers:
 18 |         - name: fastapi-app
 19 |           image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
 20 |           imagePullPolicy: {{ .Values.image.pullPolicy }}
 21 |           ports:
 22 |             - containerPort: {{ .Values.service.fastapi.port }}
 23 |           env:
 24 |             - name: WATSONX_URL
 25 |               value: "{{ .Values.env.WATSONX_URL }}"
 26 |             - name: WX_PROJECT_ID
 27 |               value: "{{ .Values.env.WX_PROJECT_ID }}"
 28 |             - name: IBM_CLOUD_API_KEY
 29 |               value: "{{ .Values.env.IBM_CLOUD_API_KEY }}"
 30 |             - name: CELERY_BROKER_URL
 31 |               value: "{{ .Values.env.CELERY_BROKER_URL }}"
 32 |             - name: CELERY_RESULT_BACKEND
 33 |               value: "{{ .Values.env.CELERY_RESULT_BACKEND }}"
 34 | 
 35 | ---
 36 | 
 37 | apiVersion: apps/v1
 38 | kind: Deployment
 39 | metadata:
 40 |   name: redis
 41 |   labels:
 42 |     app: redis
 43 | spec:
 44 |   replicas: {{ .Values.replicaCount }}
 45 |   selector:
 46 |     matchLabels:
 47 |       app: redis
 48 |   template:
 49 |     metadata:
 50 |       labels:
 51 |         app: redis
 52 |     spec:
 53 |       containers:
 54 |         - name: redis
 55 |           image: redis:7.2.5-alpine
 56 |           ports:
 57 |             - containerPort: {{ .Values.service.redis.port }}
 58 | 
 59 | ---
 60 | 
 61 | apiVersion: apps/v1
 62 | kind: Deployment
 63 | metadata:
 64 |   name: celery-worker
 65 |   labels:
 66 |     app: celery-worker
 67 | spec:
 68 |   replicas: {{ .Values.replicaCount }}
 69 |   selector:
 70 |     matchLabels:
 71 |       app: celery-worker
 72 |   template:
 73 |     metadata:
 74 |       labels:
 75 |         app: celery-worker
 76 |     spec:
 77 |       containers:
 78 |         - name: celery-worker
 79 |           image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
 80 |           imagePullPolicy: {{ .Values.image.pullPolicy }}
 81 |           command: ["celery", "-A", "app.celery.celery_worker.celery", "worker", "--loglevel=info"]
 82 |           env:
 83 |             - name: WATSONX_URL
 84 |               value: "{{ .Values.env.WATSONX_URL }}"
 85 |             - name: WX_PROJECT_ID
 86 |               value: "{{ .Values.env.WX_PROJECT_ID }}"
 87 |             - name: IBM_CLOUD_API_KEY
 88 |               value: "{{ .Values.env.IBM_CLOUD_API_KEY }}"
 89 |             - name: CELERY_BROKER_URL
 90 |               value: "{{ .Values.env.CELERY_BROKER_URL }}"
 91 |             - name: CELERY_RESULT_BACKEND
 92 |               value: "{{ .Values.env.CELERY_RESULT_BACKEND }}"
 93 | 
 94 | ---
 95 | 
 96 | apiVersion: apps/v1
 97 | kind: Deployment
 98 | metadata:
 99 |   name: flower
100 |   labels:
101 |     app: flower
102 | spec:
103 |   replicas: {{ .Values.replicaCount }}
104 |   selector:
105 |     matchLabels:
106 |       app: flower
107 |   template:
108 |     metadata:
109 |       labels:
110 |         app: flower
111 |     spec:
112 |       containers:
113 |         - name: flower
114 |           image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
115 |           imagePullPolicy: {{ .Values.image.pullPolicy }}
116 |           command: ["celery", "--broker=redis://redis:6379/0", "flower", "--port=5555"]
117 |           ports:
118 |             - containerPort: {{ .Values.service.flower.port }}
119 |           env:
120 |             - name: CELERY_BROKER_URL
121 |               value: "{{ .Values.env.CELERY_BROKER_URL }}"
122 |             - name: CELERY_RESULT_BACKEND
123 |               value: "{{ .Values.env.CELERY_RESULT_BACKEND }}"
124 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/DrawerMenu.jsx:
--------------------------------------------------------------------------------
  1 | import { Box, Toolbar, Typography } from "@mui/material";
  2 | import Drawer from "@mui/material/Drawer";
  3 | import List from "@mui/material/List";
  4 | import Divider from "@mui/material/Divider";
  5 | import ListItem from "@mui/material/ListItem";
  6 | import ListItemButton from "@mui/material/ListItemButton";
  7 | import ListItemIcon from "@mui/material/ListItemIcon";
  8 | import ListItemText from "@mui/material/ListItemText";
  9 | import HomeOutlinedIcon from "@mui/icons-material/HomeOutlined";
 10 | import LogoutOutlinedIcon from "@mui/icons-material/LogoutOutlined";
 11 | import { signOut } from "next-auth/react";
 12 | import GavelOutlinedIcon from "@mui/icons-material/GavelOutlined";
 13 | import BatchPredictionOutlinedIcon from "@mui/icons-material/BatchPredictionOutlined";
 14 | import HelpCenterOutlinedIcon from "@mui/icons-material/HelpCenterOutlined";
 15 | import { app_labels_and_config } from "@/services/Config";
 16 | 
 17 | const DrawerMenu = ({
 18 |   open,
 19 |   handleDrawwerOpen,
 20 |   handleDrawwerClose,
 21 |   handleLogout,
 22 | }) => {
 23 |   const list = () => (
 24 |     <Box
 25 |       sx={{ width: 300 }}
 26 |       role="presentation"
 27 |       onClick={handleDrawwerClose}
 28 |       onKeyDown={handleDrawwerClose}
 29 |     >
 30 |       <Box
 31 |         display="flex"
 32 |         justifyContent="center"
 33 |         alignItems="center"
 34 |         width="100%"
 35 |         sx={{ textDecoration: "none" }}
 36 |       >
 37 |         <Typography
 38 |           style={{
 39 |             fontSize: "24px",
 40 |             color: "#3B3B3B",
 41 |             margin: "10px",
 42 |             fontWeight: "bold",
 43 |           }}
 44 |         >
 45 |           {app_labels_and_config.app_title}
 46 |         </Typography>
 47 |       </Box>
 48 |       <Divider />
 49 |       <List>
 50 |         <ListItem disablePadding>
 51 |           <ListItemButton href="/">
 52 |             <ListItemIcon>
 53 |               <HomeOutlinedIcon />
 54 |             </ListItemIcon>
 55 |             <ListItemText primary={"Home"} />
 56 |           </ListItemButton>
 57 |         </ListItem>
 58 |       </List>
 59 |       <Divider />
 60 |       <List>
 61 |         <ListItem disablePadding>
 62 |           <ListItemButton href="/pages/single">
 63 |             <ListItemIcon>
 64 |               <GavelOutlinedIcon />
 65 |             </ListItemIcon>
 66 |             <ListItemText
 67 |               primary={app_labels_and_config.buttons.single_page_action}
 68 |             />
 69 |           </ListItemButton>
 70 |         </ListItem>
 71 |         <ListItem disablePadding>
 72 |           <ListItemButton href="/pages/batch">
 73 |             <ListItemIcon>
 74 |               <BatchPredictionOutlinedIcon />
 75 |             </ListItemIcon>
 76 |             <ListItemText
 77 |               primary={app_labels_and_config.buttons.batch_page_action}
 78 |             />
 79 |           </ListItemButton>
 80 |         </ListItem>
 81 |         <ListItem disablePadding>
 82 |           <ListItemButton href="/pages/help">
 83 |             <ListItemIcon>
 84 |               <HelpCenterOutlinedIcon />
 85 |             </ListItemIcon>
 86 |             <ListItemText primary={"Help"} />
 87 |           </ListItemButton>
 88 |         </ListItem>
 89 |       </List>
 90 |       <Divider />
 91 |       <List>
 92 |         <ListItem disablePadding>
 93 |           <ListItemButton
 94 |             onClick={(event) => {
 95 |               signOut({ callbackUrl: "/" });
 96 |             }}
 97 |           >
 98 |             <ListItemIcon>
 99 |               <LogoutOutlinedIcon />
100 |             </ListItemIcon>
101 |             <ListItemText primary={"Logout"} />
102 |           </ListItemButton>
103 |         </ListItem>
104 |       </List>
105 |     </Box>
106 |   );
107 | 
108 |   return (
109 |     <Drawer anchor="right" open={open}>
110 |       {list()}
111 |     </Drawer>
112 |   );
113 | };
114 | 
115 | export default DrawerMenu;
116 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/LeftNavigation.jsx:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | import { Sidebar, Menu, MenuItem, SubMenu } from "react-pro-sidebar";
  3 | import HomeOutlinedIcon from "@mui/icons-material/HomeOutlined";
  4 | import { Divider, Toolbar, Typography } from "@mui/material";
  5 | import LoginOutlinedIcon from "@mui/icons-material/LoginOutlined";
  6 | import LogoutOutlinedIcon from "@mui/icons-material/LogoutOutlined";
  7 | import CreateNewFolderOutlinedIcon from "@mui/icons-material/CreateNewFolderOutlined";
  8 | import Link from "next/link";
  9 | import GavelOutlinedIcon from "@mui/icons-material/GavelOutlined";
 10 | import BatchPredictionOutlinedIcon from "@mui/icons-material/BatchPredictionOutlined";
 11 | import { useSession, signIn, signOut } from "next-auth/react";
 12 | import { useEffect } from "react";
 13 | 
 14 | function LeftNavBar() {
 15 |   const { data: session, status } = useSession();
 16 | 
 17 |   useEffect(() => {
 18 |     if (
 19 |       status != "loading" &&
 20 |       session &&
 21 |       session?.error === "RefreshAccessTokenError"
 22 |     ) {
 23 |       signOut({ callbackUrl: "/" });
 24 |     }
 25 |   }, [session, status]);
 26 | 
 27 |   return (
 28 |     <>
 29 |       {session && (
 30 |         <Sidebar>
 31 |           <Toolbar style={{ flexDirection: "column" }}>
 32 |             <Typography variant="h5">LLM Judge</Typography>
 33 |             {status === "loading" && (
 34 |               <Typography style={{ fontSize: "12px" }}>Loading..</Typography>
 35 |             )}
 36 |             {session && (
 37 |               <Typography style={{ fontSize: "12px" }}>
 38 |                 Logged in as {session.user.email}
 39 |               </Typography>
 40 |             )}
 41 |           </Toolbar>
 42 |           <Divider />
 43 |           <Menu
 44 |             menuItemStyles={{
 45 |               button: {
 46 |                 // the active class will be added automatically by react router
 47 |                 // so we can use it to style the active menu item
 48 |                 [`&.active`]: {
 49 |                   backgroundColor: "#13395e",
 50 |                   color: "#b6c8d9",
 51 |                 },
 52 |               },
 53 |             }}
 54 |           >
 55 |             <MenuItem
 56 |               icon={<HomeOutlinedIcon />}
 57 |               component={<Link href={"/"} />}
 58 |             >
 59 |               {" "}
 60 |               Home{" "}
 61 |             </MenuItem>
 62 | 
 63 |             {session && (
 64 |               <SubMenu label={"Judge"} defaultOpen icon={<GavelOutlinedIcon />}>
 65 |                 {session && (
 66 |                   <MenuItem
 67 |                     icon={<CreateNewFolderOutlinedIcon />}
 68 |                     component={<Link href={"/pages/solo"} />}
 69 |                   >
 70 |                     Single{" "}
 71 |                   </MenuItem>
 72 |                 )}
 73 |                 {session && (
 74 |                   <MenuItem
 75 |                     icon={<BatchPredictionOutlinedIcon />}
 76 |                     component={<Link href={"/pages/lot"} />}
 77 |                   >
 78 |                     Batch{" "}
 79 |                   </MenuItem>
 80 |                 )}
 81 |               </SubMenu>
 82 |             )}
 83 | 
 84 |             {!session && (
 85 |               <MenuItem
 86 |                 icon={<LoginOutlinedIcon />}
 87 |                 onClick={() => signIn("auth0")}
 88 |               >
 89 |                 Login
 90 |               </MenuItem>
 91 |             )}
 92 | 
 93 |             {session && (
 94 |               <MenuItem
 95 |                 icon={<LogoutOutlinedIcon />}
 96 |                 onClick={() => {
 97 |                   signOut({ callbackUrl: "/" });
 98 |                 }}
 99 |               >
100 |                 Logout
101 |               </MenuItem>
102 |             )}
103 |           </Menu>
104 |         </Sidebar>
105 |       )}
106 |     </>
107 |   );
108 | }
109 | 
110 | export default LeftNavBar;
111 | 


--------------------------------------------------------------------------------
/JudgeIt-App/README.md:
--------------------------------------------------------------------------------
 1 | <!-- ABOUT THE PROJECT -->
 2 | 
 3 | <!-- omit in toc -->
 4 | # JudgeIt Application
 5 | 
 6 | One method of using JudgeIt is through a Service-Oriented Architecture (SOA). This directory contains the code for a React-based application that provides a user interface for interacting with the LLM Judge service. It is built on the Next.js framework and integrates with IBM App ID for authentication. There are three types of evaluation currently available:
 7 | 
 8 | 1. **RAG Evaluation (Similarity)**: evaluate generated text against golden text
 9 | 2. **RAG Evaluation (Rating)**: evaluate generated text against golden text
10 | 3. **Multi-turn evaluation**: evaluate rewritten queries given a mult-turn conversation
11 | 
12 | The JudgeIt framework takes input data in the form of excel or csv files for any of these evaluations.
13 | 
14 | ![LLM-Judges](/images/flow-diagram.png)
15 | 
16 | <!-- omit in toc -->
17 | ## Table of Contents
18 | 
19 | - [Getting Started](#getting-started)
20 |   - [Prerequisites](#prerequisites)
21 |   - [Installation](#installation)
22 | - [Configuring your Input File](#configuring-your-input-file)
23 | - [Understanding the Results](#understanding-the-results)
24 | 
25 | <!-- GETTING STARTED -->
26 | 
27 | ## Getting Started
28 | 
29 | ### Prerequisites
30 | 
31 | The following prerequisites are required to run the tester:
32 | 
33 | 1. [JudgeIt Backend REST Service](/REST-Service/README.md) is up and running
34 | 2. [Node.js](https://nodejs.org/en) v18 or higher
35 | 3. [IBM AppID](https://www.ibm.com/products/app-id) for application authentication
36 | 
37 | ### Installation
38 | 
39 | 1. Change directory into the JudgeIt App
40 | 
41 |    ```bash
42 |    cd JudgeIt-LLM-as-a-Judge/JudgeIt-App
43 |    ```
44 | 
45 | 2. Copy env file to .env
46 | 
47 |    ```bash
48 |    cp env .env
49 |    ```
50 | 
51 | 3. Configure your parameters in .env. Make sure `NEXT_PUBLIC_LLM_JUDGE_API_KEY` value matches with the value assigned in backend service.
52 | 
53 | 4. Install dependencies
54 | 
55 |    ```bash
56 |    npm install
57 |    ```
58 | 
59 | 5. Run the development server
60 | 
61 |    ```bash
62 |    npm run dev
63 |    ```
64 | 
65 | 6. Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
66 | 
67 | ## Configuring your Input File
68 | 
69 | Each type of LLM Judge will accept an excel/csv file as an input file. The repository contains a sample input file for each type of LLM Judge that you can copy, edit, and use to test. They are located at: [JudgeIt-LLM-as-a-Judge/Framework/data/input](../Framework/data/input)
70 | 
71 | 1. RAG Evaluation (Similarity): provide an excel/csv file with a `golden_text` column and `generated_text` column to compare
72 | 2. RAG Evaluation (Rating): provide an excel/csv file with a `golden_text` column and `generated_text` column to compare
73 | 3. Multi-turn Evaluation: provide an excel/csv file with the following columns: `previous_question`, `previous_answer`, `current_question`, `golden_rewritten_question`, and `rewritten_question`
74 | 
75 | Note: Your input files can contain additional columns than the ones specified above. These columns will have no effect on the LLM Judge and will be perserved in the output file.
76 | 
77 | ## Understanding the Results
78 | 
79 | The generated results will be saved to an excel/csv file at the location specified in your config file. Each file will contain all the columns provided in the input file.
80 | 
81 | 1. For RAG Evaluation (Similarity), the LLM Judge will output a `Grade` and `Explanation`. A grade of 0 means the texts are dissimilar, while a grade of 1 means the texts are similar.
82 | 2. For RAG Evaluation (Rating), the LLM Judge will output a `Grade` and `Explanation`. A grade of 1 means the texts are dissimilar, a grade of 2 means the texts are partially similar, and a text of 3 means the texts are significantly similar.
83 | 3. For Multi-turn Evaluation, the LLM Judge will output a `Grade`. A grade of 0 means the golden rewritten question and rewritten question are dissimilar, while a grade of 1 means the questions are similar.
84 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/EvaluationTypeComponent.jsx:
--------------------------------------------------------------------------------
  1 | import {
  2 |   FormControl,
  3 |   FormHelperText,
  4 |   RadioGroup,
  5 |   FormControlLabel,
  6 |   Radio,
  7 |   FormLabel,
  8 | } from "@mui/material";
  9 | import {
 10 |   API_TYPE_MULTITURN,
 11 |   API_TYPE_RATING,
 12 |   API_TYPE_SIMILARITY,
 13 |   API_TYPE_WBOX_SDR,
 14 |   API_TYPE_BBOX_SDR,  
 15 |   API_TYPE_KEY,
 16 |   API_TYPE_SINGLETURN,
 17 |   API_TYPE_AGENT,
 18 | } from "@/services/Config";
 19 | import EvaluationTypeLabel from "@/components/judge/EvaluationTypeLabel";
 20 | 
 21 | const EvaluationTypeComponent = ({
 22 |   values,
 23 |   handleChange,
 24 |   handleBlur,
 25 |   errors,
 26 |   touched,
 27 |   api_call_inprogress
 28 | }) => {
 29 |   return (
 30 |     <div>
 31 |       {" "}
 32 |       <FormControl
 33 |         component="fieldset"
 34 |         error={touched.apiType && Boolean(errors.apiType)}
 35 |         disabled={api_call_inprogress}
 36 |       >
 37 |         <FormLabel id="demo-radio-buttons-group-label">
 38 |           Evaluation Type
 39 |         </FormLabel>
 40 |         <RadioGroup
 41 |           aria-labelledby="demo-radio-buttons-group-label"
 42 |           aria-label="option"
 43 |           name={API_TYPE_KEY}
 44 |           value={values.apiType}
 45 |           onChange={handleChange}
 46 |           onBlur={handleBlur}
 47 |         >
 48 |           <FormControlLabel
 49 |             value={API_TYPE_RATING}
 50 |             control={<Radio />}
 51 |             label={
 52 |               <EvaluationTypeLabel
 53 |                 tooltip="Evaluate generated text against golden text and receive a binary score for similarity"
 54 |                 label={"RAG Evaluation - Answer Rating"}
 55 |               />
 56 |             }
 57 |           />
 58 |           <FormControlLabel
 59 |             value={API_TYPE_SIMILARITY}
 60 |             control={<Radio />}
 61 |             label={
 62 |               <EvaluationTypeLabel
 63 |                 tooltip="Evaluate generated text against golden text and receive a 1/2/3 rating based on degree of similarity"
 64 |                 label={"RAG Evaluation - Answer Similarity"}
 65 |               />
 66 |             }
 67 |           />
 68 |           <FormControlLabel
 69 |             value={API_TYPE_SINGLETURN}
 70 |             control={<Radio />}
 71 |             label={
 72 |               <EvaluationTypeLabel
 73 |                 tooltip="Evaluate rewritten queries given a single turn conversation and receive a binary score for similarity"
 74 |                 label={"Single turn Query Rewrite Evaluation"}
 75 |               />
 76 |             }
 77 |           />
 78 |           <FormControlLabel
 79 |             value={API_TYPE_MULTITURN}
 80 |             control={<Radio />}
 81 |             label={
 82 |               <EvaluationTypeLabel
 83 |                 tooltip="Evaluate rewritten queries given a mult-turn with conversation history and receive a binary score for similarity"
 84 |                 label={"Multi turn Query Rewrite Evaluation"}
 85 |               />
 86 |             }
 87 |           />
 88 |           <FormControlLabel
 89 |             value={API_TYPE_WBOX_SDR}
 90 |             control={<Radio />}
 91 |             label={
 92 |               <EvaluationTypeLabel
 93 |                 tooltip="Run WhiteBox Evaluation for Agentic Solution"
 94 |                 label={"WhiteBox Evaluation - Thought Trail Eval"}
 95 |               />
 96 |             }
 97 |           />
 98 |           <FormControlLabel
 99 |             value={API_TYPE_BBOX_SDR}
100 |             control={<Radio />}
101 |             label={
102 |               <EvaluationTypeLabel
103 |                 tooltip="Run BlackBox Evaluation for Agentic Solution"
104 |                 label={"BlackBox Evaluation - Agent Quality Eval"}
105 |               />
106 |             }
107 |           />
108 |           <FormControlLabel
109 |             value={API_TYPE_AGENT}
110 |             control={<Radio />}
111 |             label={
112 |               <EvaluationTypeLabel
113 |                 tooltip="Run Evaluation for Agentic Solution"
114 |                 label={"Agent Evaluation - Agent Quality and Workflow Eval"}
115 |               />
116 |             }
117 |           />
118 | 
119 |         </RadioGroup>
120 |         {touched.apiType && errors.apiType && (
121 |           <FormHelperText>{errors.apiType}</FormHelperText>
122 |         )}
123 |       </FormControl>
124 |     </div>
125 |   );
126 | };
127 | 
128 | export default EvaluationTypeComponent;
129 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/SingleInstructions.jsx:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | import { Box, Typography, Button } from "@mui/material";
  3 | import GavelOutlinedIcon from "@mui/icons-material/GavelOutlined";
  4 | 
  5 | function SingleInstructions() {
  6 |   return (
  7 |     <Box sx={{ width: "95%", marginBottom: 2 }}>
  8 |       <Typography
  9 |         style={{
 10 |           fontSize: "30px",
 11 |           color: "#3B3B3B",
 12 |           fontWeight: "bold",
 13 |         }}
 14 |       >
 15 |         <GavelOutlinedIcon fontSize="30px" style={{ marginRight: "10px" }} />
 16 |         Single Answer Evaluation Instructions
 17 |       </Typography>
 18 | 
 19 |       <Typography
 20 |         style={{
 21 |           fontSize: "16px",
 22 |           color: "#3B3B3B",
 23 |           margin: "10px",
 24 |         }}
 25 |       >
 26 |         Evaluate a single input using different LLM Judge types.
 27 |       </Typography>
 28 |       <ol className="list-decimal list-inside mb-4">
 29 |         <li className="mb-2">
 30 |           <Typography
 31 |             style={{
 32 |               fontSize: "16px",
 33 |               color: "#3B3B3B",
 34 |               margin: "10px",
 35 |               fontWeight: "bold",
 36 |             }}
 37 |           >
 38 |             RAG Evaluation (Similarity):
 39 |           </Typography>
 40 |           <ul className="list-disc list-inside ml-4">
 41 |             <li>
 42 |               <b>Function: </b>Compare a golden text to a generated text
 43 |             </li>
 44 |             <li>
 45 |               <b>Input: </b>Provide the following:
 46 |             </li>
 47 | 
 48 |             <ul className="list-none ml-8">
 49 |               <li>golden text</li>
 50 |               <li>generated text</li>
 51 |             </ul>
 52 |             <li>
 53 |               <b>Output: </b>The LLM Judge will output a Grade and Explanation.
 54 |               A grade of 0 means the texts are dissimilar, while a grade of 1
 55 |               means the texts are similar.
 56 |             </li>
 57 |           </ul>
 58 |         </li>
 59 | 
 60 |         <li className="mb-2">
 61 |           <Typography
 62 |             style={{
 63 |               fontSize: "16px",
 64 |               color: "#3B3B3B",
 65 |               margin: "10px",
 66 |               fontWeight: "bold",
 67 |             }}
 68 |           >
 69 |             RAG Evaluation (Rating):
 70 |           </Typography>
 71 |           <ul className="list-disc list-inside ml-4">
 72 |             <li>
 73 |               <b>Function: </b>Compare a golden text to a generated text
 74 |             </li>
 75 |             <li>
 76 |               <b>Input: </b>Provide the following:
 77 |             </li>
 78 |             <ul className="list-none ml-8">
 79 |               <li>golden text</li>
 80 |               <li>generated text</li>
 81 |             </ul>
 82 |             <li>
 83 |               <b>Output: </b>The LLM Judge will output a Grade and Explanation.
 84 |               A grade of 1 means the texts are dissimilar, a grade of 2 means
 85 |               the texts are partially similar, and a text of 3 means the texts
 86 |               are significantly similar
 87 |             </li>
 88 |           </ul>
 89 |         </li>
 90 | 
 91 |         <li className="mb-2">
 92 |           <Typography
 93 |             style={{
 94 |               fontSize: "16px",
 95 |               color: "#3B3B3B",
 96 |               margin: "10px",
 97 |               fontWeight: "bold",
 98 |             }}
 99 |           >
100 |             Multi-turn Evaluation:
101 |           </Typography>
102 |           <ul className="list-disc list-inside ml-4">
103 |             <li>
104 |               <b>Function: </b>Compare a golden rewritten query to a rewritten
105 |               query based on a multi-turn conversation
106 |             </li>
107 |             <li>
108 |               <b>Input: </b>Provide the following:
109 |             </li>
110 |             <ul className="list-none ml-8">
111 |               <li>previous question</li>
112 |               <li>previous answer</li>
113 |               <li>current question</li>
114 |               <li>golden rewritten question</li>
115 |               <li>rewritten question</li>
116 |             </ul>
117 |             <li>
118 |               <b>Output: </b>The LLM Judge will output a Grade and Explanation.
119 |               A grade of 0 means the texts are dissimilar, while a grade of 1
120 |               means the texts are similar.
121 |             </li>
122 |           </ul>
123 |         </li>
124 |       </ol>
125 |       <Button variant="outlined" href="/pages/single">
126 |         Single Answer Evaluation
127 |       </Button>
128 |     </Box>
129 |   );
130 | }
131 | 
132 | export default SingleInstructions;
133 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/services/ManagementService.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from typing import Any, Dict
 3 | from app.src.models.RequestHistory import RequestHistory
 4 | from app.src.models.Experiment import Experiment
 5 | from app.src.services.MongoService import MongoService
 6 | from bson.json_util import dumps, loads
 7 | from bson.objectid import ObjectId
 8 | 
 9 | class ManagementService:
10 | 
11 |     def __init__(self, mongo_db: MongoService) -> None:
12 |         self.experiment_collection = mongo_db.get_experiment_collection()
13 |         self.history_collection = mongo_db.get_request_history_collection()
14 | 
15 |     def get_experiments(self, user_id):
16 |         cursor = self.experiment_collection.find({ "user_id": user_id })
17 |         experiments = [self.bson_to_dict(doc) for doc in cursor]
18 |         return experiments
19 |     
20 |     def get_experiments_by_type(self, user_id: str, type: str):
21 |         cursor = self.experiment_collection.find({ "user_id": user_id, "type": type })
22 |         experiments = [self.bson_to_dict(doc) for doc in cursor]
23 |         return experiments
24 |     
25 |     def get_experiment_by_name(self, user_id: str, name: str):
26 |         cursor = self.experiment_collection.find_one({ "user_id": user_id, "name": name })
27 |         if cursor is not None:
28 |             return self.bson_to_dict(cursor)
29 |         return None
30 |     
31 |     def get_experiment_by_name_and_type(self, user_id: str, name: str, type: str):
32 |         cursor = self.experiment_collection.find_one({ "user_id": user_id, "name": name, "type": type })
33 |         if cursor is not None:
34 |           return self.bson_to_dict(cursor)
35 |         return None
36 |     
37 |     def get_history_by_id(self, user_id: str, doc_id: str):
38 |         object_id = ObjectId(doc_id)
39 |         cursor = self.history_collection.find_one({"user_id": user_id, "_id": object_id})
40 |         return self.bson_to_dict(cursor)
41 | 
42 |     def get_histories(self, user_id):
43 |         cursor = self.history_collection.find({ "user_id": user_id })
44 |         histories = [self.bson_to_dict(doc) for doc in cursor]
45 |         return histories
46 |     
47 |     def get_histories_by_type(self, user_id: str, type: str):
48 |         projection = {'content': 0}
49 |         cursor = self.history_collection.find({ "user_id": user_id, "type": type }, projection )
50 |         histories = [self.bson_to_dict(doc) for doc in cursor]
51 |         return histories
52 |     
53 |     def get_histories_by_experiment_name(self, user_id, experiment_name):
54 |         cursor = self.history_collection.find({ "user_id": user_id, "experiment_name": experiment_name })
55 |         histories = [self.bson_to_dict(doc) for doc in cursor]
56 |         return histories
57 |     
58 |     def get_histories_by_experiment_name_type(self, user_id: str, experiment_name: str, type: str):
59 |         query: dict = { "user_id": user_id, "experiment_name": experiment_name, "type": type }
60 |         cursor = self.history_collection.find(query)
61 |         print("calling here", query)
62 |         histories = [self.bson_to_dict(doc) for doc in cursor]
63 |         return histories
64 |     
65 |     def add_experiment(self, experiment: Experiment):
66 |         input = experiment.model_dump()
67 |         insertion = self.experiment_collection.insert_one(input)
68 |         return str(insertion.inserted_id)
69 |     
70 |     def add_history(self, request_history: RequestHistory) -> str:
71 |         input = request_history.model_dump()
72 |         insertion = self.history_collection.insert_one(input)
73 |         return str(insertion.inserted_id)
74 |     
75 |     def delete_experiment(self, doc_id: str, user_id):
76 |         object_id = ObjectId(doc_id)
77 |         result = self.experiment_collection.delete_one({"_id": object_id, "user_id": user_id})
78 |         return result.deleted_count
79 |     
80 |     def delete_experiment_by_name(self, experiment_name, user_id):
81 |         ## Delete all document under experiment name in request history collection
82 |         self.history_collection.delete_many({"experiment_name": experiment_name, "user_id": user_id})
83 |         ## Delete from experiment collections
84 |         result = self.experiment_collection.delete_one({"name": experiment_name, "user_id": user_id})
85 |         return result.deleted_count
86 |     
87 |     def delete_history(self, doc_id: str, user_id:str):
88 |         object_id = ObjectId(doc_id)
89 |         result = self.history_collection.delete_one({"_id": object_id, "user_id": user_id})
90 |         return result.deleted_count
91 | 
92 |     # Function to convert BSON document to a dictionary
93 |     def bson_to_dict(self, bson_doc) -> Dict[str, Any]:
94 |         # Convert ObjectId to string and return as dictionary
95 |         doc = bson_doc.copy()  # Create a copy to avoid modifying the original
96 |         doc['_id'] = str(doc['_id'])  # Convert ObjectId to string
97 |         return doc


--------------------------------------------------------------------------------
/JudgeIt-App/app/pages/single/doc/[doc_id]/page.js:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | import { useParams } from "next/navigation";
  3 | import { useSession } from "next-auth/react";
  4 | import { Grid, Box, Button, Typography, CircularProgress } from "@mui/material";
  5 | import EvaluationHistoryLeftBar from "@/components/judge/EvaluationHistoryLeftBar";
  6 | import { useEffect, useRef, useState } from "react";
  7 | import { fetch_request_history_by_id } from "@/services/ManagemenBackendAPI";
  8 | import {
  9 |   API_TYPE_SINGLETURN,
 10 |   API_TYPE_MULTITURN,
 11 |   API_TYPE_RATING,
 12 |   API_TYPE_SIMILARITY,
 13 | } from "@/services/Config";
 14 | import DisplayRequestHistoryRatingSimilarity from "@/components/judge/DisplayRequestHistoryRatingSimilarity";
 15 | import DisplayRequestHistorySingleTurn from "@/components/judge/DisplayRequestHistorySingleTurn";
 16 | import ArrowBackOutlinedIcon from "@mui/icons-material/ArrowBackOutlined";
 17 | import Footer from "@/components/globals/Footer";
 18 | import DisplayRequestHistoryMultiTurnConversation from "@/components/judge/DisplayRequestHistoryMultiTurn";
 19 | 
 20 | const ItemPage = () => {
 21 |   const params = useParams();
 22 |   const { data: session, status } = useSession();
 23 |   const hasEffectRun = useRef(false);
 24 |   const [serverData, setServerData] = useState(null);
 25 |   const { doc_id } = params; // Get the 'id' from the URL
 26 | 
 27 |   useEffect(() => {
 28 |     if (hasEffectRun.current) {
 29 |       return; // Prevents the effect from running again
 30 |     }
 31 | 
 32 |     const fetch_data = async () => {
 33 |       const data = await fetch_request_history_by_id(
 34 |         session.user.email,
 35 |         doc_id
 36 |       );
 37 |       setServerData(data);
 38 |     };
 39 | 
 40 |     if (session?.user.email) {
 41 |       fetch_data();
 42 |       hasEffectRun.current = true;
 43 |     }
 44 |   }, [session?.user.email, doc_id]); // Empty dependency array, runs only once
 45 | 
 46 |   if (status === "loading") {
 47 |     return (
 48 |       <div
 49 |         style={{
 50 |           display: "flex",
 51 |           justifyContent: "center",
 52 |           alignItems: "center",
 53 |           height: "100vh",
 54 |         }}
 55 |       >
 56 |         <CircularProgress />
 57 |       </div>
 58 |     );
 59 |   }
 60 | 
 61 |   return (
 62 |     <>
 63 |       <Box display={"flex"} flexDirection={"row"}>
 64 |         <Box display={"flex"} height={"100vh"} sx={{overflowY: 'auto'}}>
 65 |           <EvaluationHistoryLeftBar type={"single"} />
 66 |         </Box>
 67 |         <Box width={"100%"} height={"93vh"} overflow={"scroll"}>
 68 |           {session && serverData && (
 69 |             <Grid spacing={0} sx={{ flexGrow: 1 }} container>
 70 |               <Grid item xs={11}>
 71 |                 <Grid
 72 |                   marginTop={"30px"}
 73 |                   spacing={0}
 74 |                   sx={{ flexGrow: 1 }}
 75 |                   container
 76 |                 >
 77 |                   <Grid item xs={12}>
 78 |                     <Box
 79 |                       display={"flex"}
 80 |                       flexDirection={"row"}
 81 |                       justifyContent={"space-between"}
 82 |                     >
 83 |                       <Typography
 84 |                         style={{
 85 |                           fontSize: "30px",
 86 |                           marginLeft: "25px",
 87 |                           color: "#3B3B3B",
 88 |                           fontWeight: "bold",
 89 |                           marginBottom: "15px",
 90 |                         }}
 91 |                       >
 92 |                         Single Answer Evaluation: {serverData.name}
 93 |                       </Typography>
 94 |                       <Button
 95 |                         size="small"
 96 |                         href="/pages/single"
 97 |                         startIcon={<ArrowBackOutlinedIcon />}
 98 |                       >
 99 |                         Back
100 |                       </Button>
101 |                     </Box>
102 |                   </Grid>
103 |                   {(API_TYPE_RATING === serverData.eval_type ||
104 |                     API_TYPE_SIMILARITY === serverData.eval_type) && (
105 |                     <DisplayRequestHistoryRatingSimilarity
106 |                       serverData={serverData}
107 |                     />
108 |                   )}
109 |                   {API_TYPE_SINGLETURN === serverData.eval_type && (
110 |                     <DisplayRequestHistorySingleTurn serverData={serverData} />
111 |                   )}
112 |                   {API_TYPE_MULTITURN === serverData.eval_type && (
113 |                     <DisplayRequestHistoryMultiTurnConversation serverData={serverData} />
114 |                   )}
115 |                 </Grid>
116 |                 <Grid item xs={12} marginLeft={"25px"} marginTop={'50px'}>
117 |                   <Footer />
118 |                 </Grid>
119 |               </Grid>
120 |             </Grid>
121 |           )}
122 |           
123 |         </Box>
124 |       </Box>
125 |     </>
126 |   );
127 | };
128 | 
129 | export default ItemPage;
130 | 


--------------------------------------------------------------------------------
/JudgeIt-App/app/page.module.css:
--------------------------------------------------------------------------------
  1 | .main {
  2 |   display: flex;
  3 |   flex-direction: column;
  4 |   justify-content: space-between;
  5 |   align-items: center;
  6 | 
  7 | }
  8 | 
  9 | .description {
 10 |   display: inherit;
 11 |   justify-content: inherit;
 12 |   align-items: inherit;
 13 |   font-size: 0.85rem;
 14 |   max-width: var(--max-width);
 15 |   width: 100%;
 16 |   z-index: 2;
 17 |   font-family: var(--font-mono);
 18 | }
 19 | 
 20 | .description a {
 21 |   display: flex;
 22 |   justify-content: center;
 23 |   align-items: center;
 24 |   gap: 0.5rem;
 25 | }
 26 | 
 27 | .description p {
 28 |   position: relative;
 29 |   margin: 0;
 30 |   padding: 1rem;
 31 |   background-color: rgba(var(--callout-rgb), 0.5);
 32 |   border: 1px solid rgba(var(--callout-border-rgb), 0.3);
 33 |   border-radius: var(--border-radius);
 34 | }
 35 | 
 36 | .code {
 37 |   font-weight: 700;
 38 |   font-family: var(--font-mono);
 39 | }
 40 | 
 41 | .grid {
 42 |   display: grid;
 43 |   grid-template-columns: repeat(4, minmax(25%, auto));
 44 |   max-width: 100%;
 45 |   width: var(--max-width);
 46 | }
 47 | 
 48 | .card {
 49 |   padding: 1rem 1.2rem;
 50 |   border-radius: var(--border-radius);
 51 |   background: rgba(var(--card-rgb), 0);
 52 |   border: 1px solid rgba(var(--card-border-rgb), 0);
 53 |   transition: background 200ms, border 200ms;
 54 | }
 55 | 
 56 | .card span {
 57 |   display: inline-block;
 58 |   transition: transform 200ms;
 59 | }
 60 | 
 61 | .card h2 {
 62 |   font-weight: 600;
 63 |   margin-bottom: 0.7rem;
 64 | }
 65 | 
 66 | .card p {
 67 |   margin: 0;
 68 |   opacity: 0.6;
 69 |   font-size: 0.9rem;
 70 |   line-height: 1.5;
 71 |   max-width: 30ch;
 72 |   text-wrap: balance;
 73 | }
 74 | 
 75 | .center {
 76 |   display: flex;
 77 |   justify-content: center;
 78 |   align-items: center;
 79 |   position: relative;
 80 |   padding: 4rem 0;
 81 | }
 82 | 
 83 | .center::before {
 84 |   background: var(--secondary-glow);
 85 |   border-radius: 50%;
 86 |   width: 480px;
 87 |   height: 360px;
 88 |   margin-left: -400px;
 89 | }
 90 | 
 91 | .center::after {
 92 |   background: var(--primary-glow);
 93 |   width: 240px;
 94 |   height: 180px;
 95 |   z-index: -1;
 96 | }
 97 | 
 98 | .center::before,
 99 | .center::after {
100 |   content: "";
101 |   left: 50%;
102 |   position: absolute;
103 |   filter: blur(45px);
104 |   transform: translateZ(0);
105 | }
106 | 
107 | .logo {
108 |   position: relative;
109 | }
110 | /* Enable hover only on non-touch devices */
111 | @media (hover: hover) and (pointer: fine) {
112 |   .card:hover {
113 |     background: rgba(var(--card-rgb), 0.1);
114 |     border: 1px solid rgba(var(--card-border-rgb), 0.15);
115 |   }
116 | 
117 |   .card:hover span {
118 |     transform: translateX(4px);
119 |   }
120 | }
121 | 
122 | @media (prefers-reduced-motion) {
123 |   .card:hover span {
124 |     transform: none;
125 |   }
126 | }
127 | 
128 | /* Mobile */
129 | @media (max-width: 700px) {
130 |   .content {
131 |     padding: 4rem;
132 |   }
133 | 
134 |   .grid {
135 |     grid-template-columns: 1fr;
136 |     margin-bottom: 120px;
137 |     max-width: 320px;
138 |     text-align: center;
139 |   }
140 | 
141 |   .card {
142 |     padding: 1rem 2.5rem;
143 |   }
144 | 
145 |   .card h2 {
146 |     margin-bottom: 0.5rem;
147 |   }
148 | 
149 |   .center {
150 |     padding: 8rem 0 6rem;
151 |   }
152 | 
153 |   .center::before {
154 |     transform: none;
155 |     height: 300px;
156 |   }
157 | 
158 |   .description {
159 |     font-size: 0.8rem;
160 |   }
161 | 
162 |   .description a {
163 |     padding: 1rem;
164 |   }
165 | 
166 |   .description p,
167 |   .description div {
168 |     display: flex;
169 |     justify-content: center;
170 |     position: fixed;
171 |     width: 100%;
172 |   }
173 | 
174 |   .description p {
175 |     align-items: center;
176 |     inset: 0 0 auto;
177 |     padding: 2rem 1rem 1.4rem;
178 |     border-radius: 0;
179 |     border: none;
180 |     border-bottom: 1px solid rgba(var(--callout-border-rgb), 0.25);
181 |     background: linear-gradient(
182 |       to bottom,
183 |       rgba(var(--background-start-rgb), 1),
184 |       rgba(var(--callout-rgb), 0.5)
185 |     );
186 |     background-clip: padding-box;
187 |     backdrop-filter: blur(24px);
188 |   }
189 | 
190 |   .description div {
191 |     align-items: flex-end;
192 |     pointer-events: none;
193 |     inset: auto 0 0;
194 |     padding: 2rem;
195 |     height: 200px;
196 |     background: linear-gradient(
197 |       to bottom,
198 |       transparent 0%,
199 |       rgb(var(--background-end-rgb)) 40%
200 |     );
201 |     z-index: 1;
202 |   }
203 | }
204 | 
205 | /* Tablet and Smaller Desktop */
206 | @media (min-width: 701px) and (max-width: 1120px) {
207 |   .grid {
208 |     grid-template-columns: repeat(2, 50%);
209 |   }
210 | }
211 | 
212 | @media (prefers-color-scheme: dark) {
213 |   .vercelLogo {
214 |     filter: invert(1);
215 |   }
216 | 
217 |   .logo {
218 |     filter: invert(1) drop-shadow(0 0 0.3rem #ffffff70);
219 |   }
220 | }
221 | 
222 | @keyframes rotate {
223 |   from {
224 |     transform: rotate(360deg);
225 |   }
226 |   to {
227 |     transform: rotate(0deg);
228 |   }
229 | }
230 | 


--------------------------------------------------------------------------------
/JudgeIt-App/services/ManagemenBackendAPI.js:
--------------------------------------------------------------------------------
  1 | import axios from "axios";
  2 | import {
  3 |   LLM_JUDGE_API_KEY_SECRET,
  4 |   LLM_JUDGE_MANAGEMENT_API_URL,
  5 | } from "./Config";
  6 | 
  7 | export async function create_experiment(payload, type) {
  8 |   if (payload.experiment_option === "new_experiment") {
  9 |     const headers = {
 10 |       accept: "application/json",
 11 |       "user-id": payload.user_id,
 12 |       LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
 13 |       "Content-Type": "application/json",
 14 |     };
 15 | 
 16 |     const url = LLM_JUDGE_MANAGEMENT_API_URL + "experiment";
 17 | 
 18 |     const data = {
 19 |       name: payload.new_experiment,
 20 |       user_id: payload.user_id,
 21 |       type: type,
 22 |     };
 23 | 
 24 |     try {
 25 |       await axios.post(url, data, { headers });
 26 |     } catch (error) {
 27 |       throw error;
 28 |     }
 29 |   }
 30 | }
 31 | 
 32 | export const fetch_experiment_list_by_type = async (user_id, type) => {
 33 |   const url = LLM_JUDGE_MANAGEMENT_API_URL + "histories/type/" + type;
 34 | 
 35 |   const headers = {
 36 |     accept: "application/json",
 37 |     "user-id": user_id,
 38 |     LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
 39 |   };
 40 | 
 41 |   try {
 42 |     const response = await axios.get(url, { headers });
 43 |     const data = response.data;
 44 |     const groupedData = data.reduce((result, item) => {
 45 |       const { experiment_name } = item;
 46 | 
 47 |       // If the experiment_name doesn't exist in result, initialize it with an empty array
 48 |       if (!result[experiment_name]) {
 49 |         result[experiment_name] = [];
 50 |       }
 51 | 
 52 |       // Push the current item into the corresponding experiment_name array
 53 |       result[experiment_name].push(item);
 54 | 
 55 |       return result;
 56 |     }, {});
 57 | 
 58 |     return groupedData;
 59 |   } catch (error) {
 60 |     console.error("Error fetching data:", error); // Handle any errors
 61 |     throw error;
 62 |   }
 63 | };
 64 | 
 65 | export const fetch_request_history_by_id = async (user_id, doc_id) => {
 66 |   const url = LLM_JUDGE_MANAGEMENT_API_URL + "histories/" + doc_id;
 67 | 
 68 |   const headers = {
 69 |     accept: "application/json",
 70 |     "user-id": user_id,
 71 |     LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
 72 |   };
 73 | 
 74 |   try {
 75 |     const response = await axios.get(url, { headers });
 76 |     const data = response.data;
 77 |     return data;
 78 |   } catch (error) {
 79 |     console.error("Error fetching fetch_request_history_by_id :", error); // Handle any errors
 80 |     throw error;
 81 |   }
 82 | };
 83 | 
 84 | export const fetch_request_history_by_name_and_type = async (
 85 |   user_id,
 86 |   experiment_name,
 87 |   type
 88 | ) => {
 89 |   const url =
 90 |     LLM_JUDGE_MANAGEMENT_API_URL +
 91 |     "histories/name/" +
 92 |     experiment_name +
 93 |     "/type/" +
 94 |     type;
 95 | 
 96 |   const headers = {
 97 |     accept: "application/json",
 98 |     "user-id": user_id,
 99 |     LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
100 |   };
101 | 
102 |   try {
103 |     const response = await axios.get(url, { headers });
104 |     const data = response.data;
105 |     return data;
106 |   } catch (error) {
107 |     console.error(
108 |       "Error fetching fetch_request_history_by_name_and_type :",
109 |       error
110 |     ); // Handle any errors
111 |     throw error;
112 |   }
113 | };
114 | 
115 | export const get_experiment_list = async (user_id, type) => {
116 |   const url = LLM_JUDGE_MANAGEMENT_API_URL + "experiments/type/" + type;
117 | 
118 |   const headers = {
119 |     accept: "application/json",
120 |     "user-id": user_id,
121 |     LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
122 |   };
123 | 
124 |   try {
125 |     const response = await axios.get(url, { headers });
126 |     const data = response.data;
127 |     return data;
128 |   } catch (error) {
129 |     console.error("Error fetching get_experiment_list :", error); // Handle any errors
130 |     throw error;
131 |   }
132 | };
133 | 
134 | export const delete_history_by_id = async (history_id, user_id) => {
135 |   const headers = {
136 |     accept: "application/json",
137 |     "user-id": user_id,
138 |     LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
139 |     "Content-Type": "application/json",
140 |   };
141 | 
142 |   try {
143 |     const response = await axios.delete(
144 |       LLM_JUDGE_MANAGEMENT_API_URL + "history/" + history_id,
145 |       {
146 |         headers: headers,
147 |       }
148 |     );
149 |     return response.data;
150 |   } catch (error) {
151 |     console.error(
152 |       "Error:",
153 |       error.response ? error.response.data : error.message
154 |     );
155 |   }
156 | };
157 | 
158 | export const delete_history_by_experiment_name = async (
159 |   experiment_name,
160 |   user_id
161 | ) => {
162 |   const headers = {
163 |     accept: "application/json",
164 |     "user-id": user_id,
165 |     LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
166 |     "Content-Type": "application/json",
167 |   };
168 | 
169 |   try {
170 |     const response = await axios.delete(
171 |       LLM_JUDGE_MANAGEMENT_API_URL + "experiment/name/" + experiment_name,
172 |       {
173 |         headers: headers,
174 |       }
175 |     );
176 |     return response.data;
177 |   } catch (error) {
178 |     console.error(
179 |       "Error:",
180 |       error.response ? error.response.data : error.message
181 |     );
182 |   }
183 | };
184 | 


--------------------------------------------------------------------------------
/evaluationapp-readme.md:
--------------------------------------------------------------------------------
  1 | This addendum extends the original **JudgeIt Application** to support comprehensive evaluation of **agentic workflows** (e.g., SDR+ agents such as Comms, Research, Product, and Chrono).  
  2 | It introduces three complementary evaluation layers — **Blackbox**, **Whitebox**, and **Negative Testing** — to assess output quality, reasoning validity, and content safety.
  3 | 
  4 | ## Evaluation done with the following methods:
  5 | 1. Blackbox (Agent-level evaluation)
  6 | 2. Whitebox (Workflow-level evaluation)
  7 | 3. Negative testing
  8 | 
  9 | ### Blackbox Evaluation (Agent-Level)
 10 | **Goal:** Evaluate output quality and completeness of each SDR+ workflow agent — without inspecting internal logic.
 11 | 
 12 | Each agent output is compared against a ground truth reference, and assigned a normalized score based on content inclusion, factual accuracy, and task completion.
 13 | 
 14 | #### Score system:
 15 | RAG Based Agents: 0–1
 16 | 0: Output incomplete or factually inaccurate
 17 | 1: Output complete, accurate, and aligned with ground truth (included all details from relevant tools)
 18 | Multi-faceted Agents: 1–3
 19 | 1: Poor clarity, off-tone, or missing key elements, funnels in correct context from previous agentic processes
 20 | 2: Adequate but minor content issues
 21 | 3: Excellent — clear, accurate, well-structured, and included all details
 22 | 
 23 | #### Implementation focuses on the following questions:
 24 | Did the agent include all expected content from the reference?
 25 | Was the information factually correct and contextually relevant?
 26 | Was the task or objective fully completed as intended?
 27 | Was the structure and tone (for Comms) aligned with the workflow standards
 28 | 
 29 | ### Whitebox evaluation implementation
 30 | It essentially focuses on trace-based evaluation. Given a (user)query and agent thought trail which is generated by processing the user query, the implementation analyses the agent thought trail and returns a score of 0/1
 31 | 
 32 | #### Score system:
 33 | 0: agent thought trail has issues/ is not valid (not useful) <br/>
 34 | 1: agent thought trail is working correctly is valid (useful)
 35 | 
 36 | #### Implementation focuses on the following questions:
 37 | 1. Was the flow valid? i.e., followed logical needed steps like thought, tool usage, thought, final answer etc
 38 | 2. Were tools used by the agent?
 39 | 3. Were the right tools used?
 40 | 4. Errors seen
 41 | 
 42 | ### Negative testing evaluation
 43 | Makes use of watsonx.governance libraries to run HAP checks on all the content present - input and output of each agent
 44 | 
 45 | #### Score system:
 46 | 0: HAP/HARM/Unethical behavior etc content NOT found <br/>
 47 | 1: HAP/HARM/Unethical behavior etc content found
 48 | 
 49 | #### For cases where wx.gov cannot flag "negative" content: Developed LLM as Judge
 50 | #### LLM as Judge Score system: (same as above)
 51 | 0: content is clean<br/>
 52 | 1: "negative" content found
 53 | 
 54 | #### Implementation:
 55 | Inorder to run the integrated codebase locally, you need to build it slightly different since the wx.gov libraries don't work well on macs:
 56 | 
 57 | 1. Build the fastapi backend separately: `podman build --platform=linux/amd64 -t fastapi_app_image -f Dockerfile .`
 58 | 2. Replace docker-compose yaml with the given compose yaml and add env variables under 'environment' sections:
 59 | ```
 60 | services:
 61 |   fastapi_app:
 62 |     container_name: fastapi_app
 63 |     platform: linux/amd64
 64 |     image: fastapi_app_image
 65 |     #volumes:
 66 |     #  - ./app:/app
 67 |     ports:
 68 |       - 3001:3001
 69 |     environment:
 70 |       - WATSONX_URL=https://us-south.ml.cloud.ibm.com
 71 |       - WX_PROJECT_ID=
 72 |       - IBM_CLOUD_API_KEY=
 73 |       - LLM_JUDGE_API_KEY=
 74 |       - WX_PLATFORM=saas
 75 |       - WX_USER=''
 76 |       - WX_GOV_REGION=eu-de
 77 |       - CELERY_BROKER_URL=redis://redis:6379/0
 78 |       - CELERY_RESULT_BACKEND=redis://redis:6379/0
 79 |       - SERVER_URL=http://localhost:3001
 80 |     restart: always
 81 | 
 82 |   redis:
 83 |     container_name: redis
 84 |     image: redis:7.2.5-alpine
 85 |     restart: always
 86 | 
 87 |   celery_worker:
 88 |     container_name: celery_worker
 89 |     build: .
 90 |     #volumes:
 91 |     #  - ./app:/app
 92 |     command: celery -A app.celery.celery_worker.celery worker --loglevel=info
 93 |     environment:
 94 |       - WATSONX_URL=https://us-south.ml.cloud.ibm.com
 95 |       - WX_PROJECT_ID=
 96 |       - WX_PLATFORM=saas
 97 |       - WX_USER=''
 98 |       - WX_GOV_REGION=eu-de
 99 |       - IBM_CLOUD_API_KEY=
100 |       - CELERY_BROKER_URL=redis://redis:6379/0
101 |       - CELERY_RESULT_BACKEND=redis://redis:6379/0
102 |     depends_on:
103 |       - fastapi_app
104 |       - redis
105 |     restart: always
106 | 
107 |   flower:
108 |     container_name: flower
109 |     build: .
110 |     command: celery --broker=redis://redis:6379/0 flower --port=5555
111 |     ports:
112 |       - 5556:5555
113 |     environment:
114 |       - CELERY_BROKER_URL=redis://redis:6379/0
115 |       - CELERY_RESULT_BACKEND=redis://redis:6379/0
116 |     depends_on:
117 |       - fastapi_app
118 |       - redis
119 |       - celery_worker
120 |     restart: always
121 | ```
122 | 3. Run to build the rest of the services: `podman-compose build`
123 | 4. Run to get the services up and running: `podman-compose up -d`
124 | 5. Run to check if all 4 services are up and running: `podman-compose ps`
125 | 
126 | 


--------------------------------------------------------------------------------
/JudgeIt-App/services/Config.js:
--------------------------------------------------------------------------------
  1 | export const APP_VERSION = "Alpha-1.0 version";
  2 | //export const LLM_JUDGE_BASE_URL = "https://llm-judge-backend-llm-judge.roks-dsce2v-13d45cd84769aede38d625cd31842ee0-0000.us-south.containers.appdomain.cloud";
  3 | export const LLM_JUDGE_BASE_URL = "http://localhost:3001";
  4 | export const LLM_JUDGE_BATCH_EVENT_URL =
  5 |   LLM_JUDGE_BASE_URL + "/api/v1/judge/events/";
  6 | export const LLM_JUDGE_DOWNLOAD_EVALUATION_URL =
  7 |   LLM_JUDGE_BASE_URL + "/api/v1/judge/download/";
  8 | export const LLM_JUDGE_MANAGEMENT_API_URL =
  9 |   LLM_JUDGE_BASE_URL + "/api/v1/manage/";
 10 | 
 11 | export const API_TYPE_KEY = "apiType";
 12 | export const API_TYPE_RATING = "rating";
 13 | export const API_TYPE_SIMILARITY = "similarity";
 14 | export const API_TYPE_SINGLETURN = "singleturn";
 15 | export const API_TYPE_MULTITURN = "multiturn";
 16 | export const API_TYPE_WBOX_SDR = "whitebox_sdrflow";
 17 | export const API_TYPE_BBOX_SDR = "blackbox_sdrflow";
 18 | export const API_TYPE_AGENT = "agent_sdrflow";
 19 | 
 20 | export const LLM_JUDGE_API_KEY_SECRET = "JudgeIt-Secret-Api-Key";
 21 | 
 22 | export const LLM_MODELS = [
 23 |   /*
 24 |     {
 25 |         value: "MIXTRAL",
 26 |         label: "MIXTRAL"
 27 |     },
 28 |     {
 29 |         value: "GPT",
 30 |         label: "GPT"
 31 |     },
 32 |     */
 33 |   {
 34 |     value: "meta-llama/llama-3-3-70b-instruct",
 35 |     label: "llama-3-3-70b-instruct (Recommended)",
 36 |   },
 37 |   {
 38 |     value: "meta-llama/llama-3-3-70b-instruct",
 39 |     label: "llama-3-3-70b-instruct"
 40 |   },
 41 | ];
 42 | 
 43 | export const GITHUB_SOURCE_CODE =
 44 |   "https://github.com/ibm-ecosystem-engineering/JudgeIt-LLM-as-a-Judge";
 45 | export const GITHUB_REPORT_ISSUE =
 46 |   "https://github.com/ibm-ecosystem-engineering/JudgeIt-LLM-as-a-Judge/issues";
 47 | 
 48 | export const rag_similarity_display = [
 49 |   "Evaluate generated text against golden text and receive a binary score for similarity",
 50 |   "The LLM Judge will output a Grade and Explanation. A grade of 0 means the texts are dissimilar, while a grade of 1 means the texts are similar.",
 51 | ];
 52 | 
 53 | export const rag_rating_display = [
 54 |   "Evaluate generated text against golden text and receive a 1/2/3 rating based on degree of similarity",
 55 |   "The LLM Judge will output a Grade and Explanation. A grade of 1 means the texts are dissimilar, a grade of 2 means the texts are partially similar, and a text of 3 means the texts are significantly similar.",
 56 | ];
 57 | 
 58 | export const multi_turn_display = [
 59 |   "Evaluate rewritten queries given a mult-turn conversation and receive a binary score for similarity",
 60 |   "The LLM Judge will output a Grade. A grade of 0 means the golden rewritten question and rewritten question are dissimilar, while a grade of 1 means the questions are similar.",
 61 | ];
 62 | 
 63 | export const wbox_display = [
 64 |   "Evaluate generated agent thought trail and workflow execution on a 0/1 rating. 1 means the workflow is executing as expected; 0 means it does not.",
 65 |   "The LLM Judge will output a score.",
 66 | ];
 67 | 
 68 | export const bbox_display = [
 69 |   "Evaluate generated agent outputs against golden text. It evaluates Chrono, Product, and Research agents on 0/1 rating and Comms Agent on 1/2/3 rating based on degree of similarity",
 70 |   "The LLM Judge will output a Grade and Explanation. A grade of 1 means the texts are dissimilar, a grade of 2 means the texts are partially similar, and a text of 3 means the texts are significantly similar.",
 71 | ];
 72 | 
 73 | export const agent_display = [
 74 |   "Evaluate generated agent outputs for both black box (LLM-as-a-judge) as well as white box (workflow)",
 75 |   "The LLM Judge will output a set of grades for the different agents as well as for overall workflow.",
 76 | ];
 77 | 
 78 | export const grade_map_rating = {
 79 |   1: "Incorrect",
 80 |   2: "Partially correct",
 81 |   3: "Correct",
 82 | };
 83 | 
 84 | export const grade_map_similarity = {
 85 |   0: "Incorrect",
 86 |   1: "Correct",
 87 | };
 88 | 
 89 | export const grade_map_multiturn = {
 90 |   0: "Incorrect",
 91 |   1: "Correct",
 92 | };
 93 | 
 94 | 
 95 | export const app_labels_and_config = {
 96 |   app_version: "Alpha-1.0 version",
 97 |   app_title: "JudgeIt",
 98 |   app_subtitle: "LLM as a Judge",
 99 |   logo_text: "Ecosystem Engineering",
100 |   buttons: {
101 |     single_page_action: "Single answer evaluation",
102 |     batch_page_action: "Batch evaluation",
103 |   },
104 |   home_page_panel_title: {
105 |     similarity_panel: "RAG Evaluation (Similarity)",
106 |     rating_panel: "RAG Evaluation (Rating)",
107 |     multiturn_panel: "Multi-turn evaluation",
108 |     home_page_intro:
109 |       "JudgeIt is an automated evaluation framework designed for testing various Generative AI pipelines such as RAG, Multi-Turn Query Rewriting, Text-to-SQL, and more. This service utilizes an LLM Judge to accurately and efficiently evaluate generated text against provided golden text. Try evaluating a single input or a batch of inputs by clicking one of the options below!",
110 |   },
111 |   pages: {
112 |     batch_evaluation_page_title: "Batch Evaluation",
113 |     single_evaluation_page_title: "Single Answer Evaluation",
114 |     graph_title: "Grade Distribution",
115 |   },
116 |   github: "https://github.com/ibm-ecosystem-engineering/JudgeIt-LLM-as-a-Judge",
117 |   github_issues:
118 |     "https://github.com/ibm-ecosystem-engineering/JudgeIt-LLM-as-a-Judge/issues",
119 | };
120 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/BatchInstructions.jsx:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | import { Box, Typography, Button } from "@mui/material";
  3 | import BatchPredictionOutlinedIcon from "@mui/icons-material/BatchPredictionOutlined";
  4 | 
  5 | function BatchInstructions() {
  6 |   return (
  7 |     <Box sx={{ width: "95%", marginBottom: 2 }}>
  8 |       <Typography
  9 |         style={{
 10 |           fontSize: "30px",
 11 |           color: "#3B3B3B",
 12 |           fontWeight: "bold",
 13 |         }}
 14 |       >
 15 |         <BatchPredictionOutlinedIcon
 16 |           fontSize="30px"
 17 |           style={{ marginRight: "10px" }}
 18 |         />
 19 |         Batch Instructions
 20 |       </Typography>
 21 | 
 22 |       <Typography
 23 |         style={{
 24 |           fontSize: "16px",
 25 |           color: "#3B3B3B",
 26 |           margin: "10px",
 27 |         }}
 28 |       >
 29 |         Each type of LLM Judge will accept an excel/csv file as an input file.
 30 |         The{" "}
 31 |         <a
 32 |           href="https://github.com/ibm-ecosystem-engineering/JudgeIt-LLM-as-a-Judge/tree/main/Framework/data/input"
 33 |           target="_blank"
 34 |           rel="noopener noreferrer"
 35 |         >
 36 |           GitHub repository
 37 |         </a>{" "}
 38 |         for this app contains a sample input file for each type of LLM Judge
 39 |         that you can copy, edit, and use to test.
 40 |       </Typography>
 41 |       <ol className="list-decimal list-inside mb-4">
 42 |         <li className="mb-2">
 43 |           <Typography
 44 |             style={{
 45 |               fontSize: "16px",
 46 |               color: "#3B3B3B",
 47 |               margin: "10px",
 48 |               fontWeight: "bold",
 49 |             }}
 50 |           >
 51 |             RAG Evaluation (Similarity):
 52 |           </Typography>
 53 |           <ul className="list-disc list-inside ml-4">
 54 |             <li>
 55 |               <b>Function: </b>Compare a golden text to a generated text
 56 |             </li>
 57 |             <li>
 58 |               <b>Input: </b>Provide an excel/csv file with the following
 59 |               columns:
 60 |             </li>
 61 | 
 62 |             <ul className="list-none ml-8">
 63 |               <li>golden_text</li>
 64 |               <li>generated_text</li>
 65 |             </ul>
 66 |             <li>
 67 |               <b>Output: </b>The LLM Judge will output a Grade and Explanation.
 68 |               A grade of 0 means the texts are dissimilar, while a grade of 1
 69 |               means the texts are similar.
 70 |             </li>
 71 |           </ul>
 72 |         </li>
 73 | 
 74 |         <li className="mb-2">
 75 |           <Typography
 76 |             style={{
 77 |               fontSize: "16px",
 78 |               color: "#3B3B3B",
 79 |               margin: "10px",
 80 |               fontWeight: "bold",
 81 |             }}
 82 |           >
 83 |             RAG Evaluation (Rating):
 84 |           </Typography>
 85 |           <ul className="list-disc list-inside ml-4">
 86 |             <li>
 87 |               <b>Function: </b>Compare a golden text to a generated text
 88 |             </li>
 89 |             <li>
 90 |               <b>Input: </b>Provide an excel/csv file with the following
 91 |               columns:
 92 |             </li>
 93 |             <ul className="list-none ml-8">
 94 |               <li>golden_text</li>
 95 |               <li>generated_text</li>
 96 |             </ul>
 97 |             <li>
 98 |               <b>Output: </b>The LLM Judge will output a Grade and Explanation.
 99 |               A grade of 1 means the texts are dissimilar, a grade of 2 means
100 |               the texts are partially similar, and a text of 3 means the texts
101 |               are significantly similar
102 |             </li>
103 |           </ul>
104 |         </li>
105 | 
106 |         <li className="mb-2">
107 |           <Typography
108 |             style={{
109 |               fontSize: "16px",
110 |               color: "#3B3B3B",
111 |               margin: "10px",
112 |               fontWeight: "bold",
113 |             }}
114 |           >
115 |             Multi-turn Evaluation:
116 |           </Typography>
117 |           <ul className="list-disc list-inside ml-4">
118 |             <li>
119 |               <b>Function: </b>Compare a golden rewritten query to a rewritten
120 |               query based on a multi-turn conversation
121 |             </li>
122 |             <li>
123 |               <b>Input: </b>Provide an excel/csv file with the following
124 |               columns:
125 |             </li>
126 |             <ul className="list-none ml-8">
127 |               <li>previous_question</li>
128 |               <li>previous_answer</li>
129 |               <li>current_question</li>
130 |               <li>golden_rewritten_question</li>
131 |               <li>rewritten_question</li>
132 |             </ul>
133 |             <li>
134 |               <b>Output: </b>The LLM Judge will output a Grade and Explanation.
135 |               A grade of 0 means the texts are dissimilar, while a grade of 1
136 |               means the texts are similar.
137 |             </li>
138 |           </ul>
139 |         </li>
140 |       </ol>
141 | 
142 |       <Typography
143 |         style={{
144 |           fontSize: "16px",
145 |           color: "#3B3B3B",
146 |           margin: "10px",
147 |         }}
148 |       >
149 |         <b>Note:</b> Your input files can contain additional columns than the
150 |         ones specified above. These columns will have no effect on the LLM Judge
151 |         and will be preserved in the output file.
152 |       </Typography>
153 |       <Button variant="outlined" href="/pages/batch">
154 |         Batch Evaluation
155 |       </Button>
156 |     </Box>
157 |   );
158 | }
159 | 
160 | export default BatchInstructions;
161 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/globals/Topbar.jsx:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | import {
  3 |   Box,
  4 |   Typography,
  5 |   AppBar,
  6 |   Link,
  7 |   useMediaQuery,
  8 |   useTheme,
  9 |   Tooltip,
 10 |   IconButton,
 11 | } from "@mui/material";
 12 | import IBMIcon from "./icons/IBMIcon";
 13 | import MenuOutlinedIcon from "@mui/icons-material/MenuOutlined";
 14 | import DrawerMenu from "@/components/globals/DrawerMenu";
 15 | import { useState } from "react";
 16 | import { useSession } from "next-auth/react";
 17 | import GitHubIcon from "@mui/icons-material/GitHub";
 18 | import {
 19 |   app_labels_and_config,
 20 | } from "@/services/Config";
 21 | 
 22 | const Topbar = () => {
 23 |   const [drawerOpen, setDrawerOpen] = useState(false);
 24 |   const { data: session, status } = useSession();
 25 |   const theme = useTheme();
 26 |   const isSmallScreen = useMediaQuery(theme.breakpoints.down("sm"));
 27 |   const isMediumScreen = useMediaQuery(theme.breakpoints.between("sm", "md"));
 28 | 
 29 |   const getFontSize = () => {
 30 |     if (isSmallScreen) return "16px";
 31 |     if (isMediumScreen) return "18px";
 32 |     return "20px";
 33 |   };
 34 | 
 35 |   const handleDrawerOpen = () => {
 36 |     if (drawerOpen) setDrawerOpen(false);
 37 |     else setDrawerOpen(true);
 38 |   };
 39 | 
 40 |   const handleDrawerClose = (event) => {
 41 |     if (drawerOpen) setDrawerOpen(false);
 42 |   };
 43 | 
 44 |   return (
 45 |     <>
 46 |       {session && (
 47 |         <AppBar
 48 |           position="static"
 49 |           style={{
 50 |             backgroundColor: "#FFFFFF",
 51 |             height: "70px",
 52 |           }}
 53 |         >
 54 |           <Box
 55 |             display="flex"
 56 |             justifyContent="space-between"
 57 |             alignItems="center"
 58 |             height="100%"
 59 |             p={0}
 60 |             color={"#3B3B3B"}
 61 |             onClick={handleDrawerClose}
 62 |           >
 63 |             <Box
 64 |               display="flex"
 65 |               justifyContent="start"
 66 |               alignItems="center"
 67 |               height="100%"
 68 |               width="100%"
 69 |               sx={{ textDecoration: "none" }}
 70 |             >
 71 |               <Link
 72 |                 href="/"
 73 |                 sx={{
 74 |                   display: "flex",
 75 |                   alignItems: "center",
 76 |                   textDecoration: "none",
 77 |                 }}
 78 |               >
 79 |                 <IBMIcon />
 80 |                 <Typography
 81 |                   sx={{
 82 |                     fontSize: getFontSize(),
 83 |                     color: "#3B3B3B",
 84 |                     fontFamily: '"Source Sans Pro", sans-serif',
 85 |                     ml: 1,
 86 |                   }}
 87 |                 >
 88 |                   {app_labels_and_config.logo_text}
 89 |                 </Typography>
 90 |               </Link>
 91 |             </Box>
 92 |             <Box
 93 |               display={"flex"}
 94 |               flexDirection={"column"}
 95 |               textAlign={"center"}
 96 |               width={"100%"}
 97 |             >
 98 |               <Typography
 99 |                 fontWeight={"600"}
100 |                 fontSize={"1.8rem"}
101 |                 lineHeight={"2.5rem"}
102 |                 fontFamily={'"Source Sans Pro", sans-serif'}
103 |               >
104 |                 {app_labels_and_config.app_title}
105 |               </Typography>
106 |               <Typography
107 |                 variant="h7"
108 |                 fontFamily={'"Source Sans Pro", sans-serif'}
109 |               >
110 |                 {app_labels_and_config.app_subtitle}
111 |               </Typography>
112 |             </Box>
113 |             <Box
114 |               display="flex"
115 |               justifyContent="end"
116 |               alignItems="center"
117 |               width="100%"
118 |             >
119 |               <Typography
120 |                 style={{
121 |                   fontSize: "12px",
122 |                   color: "#3B3B3B",
123 |                   marginRight: "10px",
124 |                 }}
125 |               >
126 |                 Logged in as {session.user.email}
127 |               </Typography>
128 |               <Tooltip title="Source code">
129 |                 <IconButton href={app_labels_and_config.github} target="_blank">
130 |                   <GitHubIcon />
131 |                 </IconButton>
132 |               </Tooltip>
133 |               <Box
134 |                 display={"flex"}
135 |                 flexDirection={"column"}
136 |                 justifyContent={"end"}
137 |                 marginRight={"10px"}
138 |               >
139 |                 <Typography alignSelf={"end"} fontSize={"11px"}>
140 |                   {app_labels_and_config.app_version}
141 |                 </Typography>
142 |                 <Link
143 |                   href={app_labels_and_config.github_issues}
144 |                   underline="none"
145 |                   alignSelf={"end"}
146 |                   target="_blank"
147 |                   fontSize={"11px"}
148 |                 >
149 |                   Report an issue
150 |                 </Link>
151 |               </Box>
152 |               <MenuOutlinedIcon
153 |                 sx={{
154 |                   cursor: "pointer",
155 |                   color: "#3B3B3B",
156 |                   marginRight: "20px",
157 |                 }}
158 |                 fontSize="large"
159 |                 onClick={handleDrawerOpen}
160 |               />
161 |               <DrawerMenu
162 |                 open={drawerOpen}
163 |                 handleDrawerClose={handleDrawerClose}
164 |                 handleDrawerOpen={handleDrawerOpen}
165 |               />
166 |             </Box>
167 |           </Box>
168 |         </AppBar>
169 |       )}
170 |     </>
171 |   );
172 | };
173 | 
174 | export default Topbar;
175 | 


--------------------------------------------------------------------------------
/JudgeIt-App/components/judge/ExperimentForm.jsx:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | import React, { useState, useRef, useEffect } from "react";
  3 | import {
  4 |   TextField,
  5 |   Box,
  6 |   FormControlLabel,
  7 |   RadioGroup,
  8 |   FormHelperText,
  9 |   Radio,
 10 |   FormControl,
 11 |   InputLabel,
 12 |   Select,
 13 |   MenuItem,
 14 |   Tooltip,
 15 | } from "@mui/material";
 16 | import { get_experiment_list } from "@/services/ManagemenBackendAPI";
 17 | import { useSession } from "next-auth/react";
 18 | import { getRandomInt } from "@/utils/Helper";
 19 | import InfoOutlinedIcon from "@mui/icons-material/InfoOutlined";
 20 | 
 21 | const ExperimentForm = ({
 22 |   values,
 23 |   handleChange,
 24 |   handleBlur,
 25 |   errors,
 26 |   touched,
 27 |   type,
 28 |   created_experiment,
 29 | }) => {
 30 |   const [serverData, setServerData] = useState([]);
 31 |   const hasEffectRun = useRef(false);
 32 |   const { data: session, status } = useSession();
 33 | 
 34 |   useEffect(() => {
 35 |     if (hasEffectRun.current) {
 36 |       return; // Prevents the effect from running again
 37 |     }
 38 | 
 39 |     const fetch_data = async () => {
 40 |       const data = await get_experiment_list(session.user.email, type);
 41 |       setServerData(data);
 42 |     };
 43 | 
 44 |     if (session?.user.email) {
 45 |       fetch_data();
 46 |       hasEffectRun.current = true;
 47 |     }
 48 |   }, [session]); // Empty dependency array, runs only once
 49 | 
 50 |   useEffect(() => {
 51 |     if (created_experiment) {
 52 |       const newData = {
 53 |         name: created_experiment,
 54 |       };
 55 |       setServerData((prevData) => [...prevData, newData]);
 56 |     }
 57 |   }, [created_experiment]); // Trigger update when `result` changes
 58 | 
 59 |   return (
 60 |     <div>
 61 |       <Box
 62 |         marginBottom={"20px"}
 63 |         marginRight={"20px"}
 64 |         display={"flex"}
 65 |         flexDirection={"row"}
 66 |       >
 67 |         <FormControl
 68 |           component="fieldset"
 69 |           error={touched.experiment_option && Boolean(errors.experiment_option)}
 70 |         >
 71 |           <RadioGroup
 72 |             row
 73 |             aria-label="option"
 74 |             name={"experiment_option"}
 75 |             value={values.experiment_option}
 76 |             onChange={handleChange}
 77 |             onBlur={handleBlur}
 78 |           >
 79 |             <FormControlLabel
 80 |               value={"new_experiment"}
 81 |               control={<Radio />}
 82 |               label="New Experiment"
 83 |             />
 84 |             <FormControlLabel
 85 |               value={"existing_experiment"}
 86 |               control={<Radio />}
 87 |               label="Select An Existing Experiment"
 88 |             />
 89 |           </RadioGroup>
 90 |           {touched.experiment_option && errors.experiment_option && (
 91 |             <FormHelperText>{errors.experiment_option}</FormHelperText>
 92 |           )}
 93 |         </FormControl>
 94 |         <Tooltip
 95 |           title="Experiment keeps all your execution together and you can review and share it later. Select one of the options. If you want to create a new experiment and save your results under it, choose 'New Experiment.' Otherwise, select an 'Existing Experiment.'"
 96 |           sx={{ marginLeft: "5px", cursor: "help", marginTop: "8px" }}
 97 |         >
 98 |           <InfoOutlinedIcon />
 99 |         </Tooltip>
100 |       </Box>
101 |       {values.experiment_option === "new_experiment" && (
102 |         <Box marginBottom={"20px"} display={"flex"} flexDirection={"row"}>
103 |           <TextField
104 |             label="New experiment"
105 |             name="new_experiment"
106 |             value={values.new_experiment}
107 |             onChange={handleChange}
108 |             onBlur={handleBlur}
109 |             error={touched.new_experiment && Boolean(errors.new_experiment)}
110 |             helperText={touched.new_experiment && errors.new_experiment}
111 |             style={{ width: "100%" }}
112 |           />
113 |           <Tooltip
114 |             title="Enter the experiment name, you want to save your execution."
115 |             sx={{ marginLeft: "5px", cursor: "help" }}
116 |           >
117 |             <InfoOutlinedIcon />
118 |           </Tooltip>
119 |         </Box>
120 |       )}
121 |       {values.experiment_option === "existing_experiment" && (
122 |         <Box
123 |           marginBottom={"20px"}
124 |           marginRight={"20px"}
125 |           display={"flex"}
126 |           flexDirection={"row"}
127 |         >
128 |           <FormControl
129 |             error={
130 |               touched.existing_experiment && Boolean(errors.existing_experiment)
131 |             }
132 |           >
133 |             <InputLabel id="existing_experiment-label">Experiment</InputLabel>
134 |             <Select
135 |               sx={{ width: "200px" }}
136 |               labelId="existing_experiment-label"
137 |               id="existing_experiment"
138 |               name="existing_experiment"
139 |               value={values.existing_experiment}
140 |               onChange={handleChange}
141 |               onBlur={handleBlur}
142 |               label="Experiment"
143 |             >
144 |               {serverData.map((item, index) => (
145 |                 <MenuItem
146 |                   key={index + "-" + getRandomInt(100)}
147 |                   value={item.name}
148 |                 >
149 |                   {item.name}
150 |                 </MenuItem>
151 |               ))}
152 |             </Select>
153 |             {touched.existing_experiment && errors.existing_experiment && (
154 |               <FormHelperText>{errors.existing_experiment}</FormHelperText>
155 |             )}
156 |           </FormControl>
157 |           <Tooltip
158 |             title="Select the experiment name, you want to save your execution."
159 |             sx={{ marginLeft: "5px", cursor: "help", marginTop: "8px" }}
160 |           >
161 |             <InfoOutlinedIcon />
162 |           </Tooltip>
163 |         </Box>
164 |       )}
165 |     </div>
166 |   );
167 | };
168 | 
169 | export default ExperimentForm;
170 | 


--------------------------------------------------------------------------------
/JudgeIt-App/services/JudgeBackendAPISolo.js:
--------------------------------------------------------------------------------
  1 | import axios from "axios";
  2 | import {
  3 |   API_TYPE_MULTITURN,
  4 |   API_TYPE_RATING,
  5 |   API_TYPE_SIMILARITY,
  6 |   LLM_JUDGE_BASE_URL,
  7 |   LLM_JUDGE_API_KEY_SECRET,
  8 |   LLM_JUDGE_MANAGEMENT_API_URL,
  9 |   API_TYPE_SINGLETURN,
 10 | } from "./Config";
 11 | 
 12 | import { create_experiment } from "./ManagemenBackendAPI";
 13 | import { generateRandomString } from "@/utils/Helper";
 14 | 
 15 | /* SOLO API ENDPOINTS */
 16 | const API_RATING_URL                      = LLM_JUDGE_BASE_URL + "/api/v1/judge/rating";
 17 | const API_SIMLARITY_URL                   = LLM_JUDGE_BASE_URL + "/api/v1/judge/similarity";
 18 | const API_SINGLE_TURN_URL                 = LLM_JUDGE_BASE_URL + "/api/v1/judge/singleturn";
 19 | const API_MULTITURN_URL                   = LLM_JUDGE_BASE_URL + "/api/v1/judge/multiturn";
 20 | 
 21 | const config = {
 22 |   headers: {
 23 |     accept: "application/json",
 24 |     LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
 25 |     "Content-Type": "application/json",
 26 |   },
 27 | };
 28 | 
 29 | /** Single request call*/
 30 | export async function judge_api_solo_call(payload) {
 31 |   
 32 |   try {
 33 |     if (payload.apiType === API_TYPE_RATING) {
 34 |       return await rating_api_call(payload);
 35 |     } else if (payload.apiType === API_TYPE_SIMILARITY) {
 36 |       return await similarity_api_call(payload);
 37 |     } else if (payload.apiType === API_TYPE_SINGLETURN) {
 38 |       return await single_turn_api_call(payload);
 39 |     } else if (payload.apiType === API_TYPE_MULTITURN) {
 40 |       return await multiturn_conversation_api_call(payload);
 41 |     }else {
 42 |       throw "API not found";
 43 |     }
 44 |   } catch (error) {
 45 |     throw error;
 46 |   } finally {
 47 |   }
 48 | }
 49 | 
 50 | async function save_request_history(payload, result) {
 51 |   const headers = {
 52 |     accept: "application/json",
 53 |     "Content-Type": "application/json",
 54 |     "user-id": payload.user_id,
 55 |     LLM_JUDGE_API_KEY: LLM_JUDGE_API_KEY_SECRET,
 56 |   };
 57 | 
 58 |   const url = LLM_JUDGE_MANAGEMENT_API_URL + "history";
 59 | 
 60 |   const experiment_name =
 61 |     payload.experiment_option === "new_experiment"
 62 |       ? payload.new_experiment
 63 |       : payload.existing_experiment;
 64 | 
 65 |   let query = {};
 66 |   let name = payload.apiType + " - " + generateRandomString(4);
 67 | 
 68 |   if (payload.apiType === API_TYPE_SINGLETURN) {
 69 |     query = {
 70 |       model: payload.model,
 71 |       previous_question: payload.previous_question,
 72 |       previous_answer: payload.previous_answer,
 73 |       current_question: payload.current_question,
 74 |       golden_rewritten_question: payload.golden_rewritten_question,
 75 |       rewritten_question: payload.rewritten_question,
 76 |     };
 77 |   } else if (payload.apiType === API_TYPE_MULTITURN) {
 78 |     query = {
 79 |       model: payload.model,
 80 |       conversation_history: payload.conversation_history,
 81 |       follow_up_query: payload.follow_up_query,
 82 |       golden_query: payload.golden_query,
 83 |       rewritten_query: payload.rewritten_query
 84 |     };
 85 |   } else {
 86 |     query = {
 87 |       model: payload.model,
 88 |       question: payload.question,
 89 |       golden_text: payload.golden_text,
 90 |       generated_text: payload.generated_text,
 91 |     };
 92 |   }
 93 | 
 94 |   const content = {
 95 |     query: query,
 96 |     result: result,
 97 |   };
 98 |   const data = {
 99 |     name: name,
100 |     user_id: payload.user_id,
101 |     experiment_name: experiment_name,
102 |     content: content,
103 |     type: "single",
104 |     eval_type: payload.apiType,
105 |   };
106 | 
107 |   try {
108 |     const response = await axios.post(url, data, { headers });
109 |     data._id = response.data.insert_id;
110 |     return data;
111 |   } catch (error) {}
112 | }
113 | 
114 | async function rating_api_call(payload) {
115 |   try {
116 |     const response = await axios.post(API_RATING_URL, payload, config);
117 | 
118 |     // creating new experiment after a successful call
119 |     await create_experiment(payload, "single");
120 | 
121 |     // save the request
122 |     const savedObject = await save_request_history(payload, response.data);
123 | 
124 |     return {
125 |       query: savedObject,
126 |       data: response.data,
127 |     };
128 |   } catch (error) {
129 |     throw error;
130 |   }
131 | }
132 | 
133 | async function similarity_api_call(payload) {
134 |   try {
135 |     const response = await axios.post(API_SIMLARITY_URL, payload, config);
136 |     // creating new experiment after a successful call
137 |     await create_experiment(payload, "single");
138 | 
139 |     // save the request
140 |     const savedObject = await save_request_history(payload, response.data);
141 | 
142 |     return {
143 |       query: savedObject,
144 |       data: response.data,
145 |     };
146 |   } catch (error) {
147 |     throw error;
148 |   }
149 | }
150 | 
151 | async function single_turn_api_call(payload) {
152 |   try {
153 |     const response = await axios.post(API_SINGLE_TURN_URL, payload, config);
154 |     // creating new experiment after a successful call
155 |     await create_experiment(payload, "single");
156 | 
157 |     // save the request
158 |     const savedObject = await save_request_history(payload, response.data);
159 | 
160 |     return {
161 |       query: savedObject,
162 |       data: response.data,
163 |     };
164 |   } catch (error) {
165 |     throw error;
166 |   }
167 | }
168 | 
169 | async function multiturn_conversation_api_call(payload) {
170 |   try {
171 |     const response = await axios.post(API_MULTITURN_URL, payload, config);
172 |     // creating new experiment after a successful call
173 |     await create_experiment(payload, "single");
174 | 
175 |     console.log("#####################################################", payload);
176 | 
177 |     // save the request
178 |     const savedObject = await save_request_history(payload, response.data);
179 | 
180 |     return {
181 |       query: savedObject,
182 |       data: response.data,
183 |     };
184 |   } catch (error) {
185 |     throw error;
186 |   }
187 | }
188 | 


--------------------------------------------------------------------------------
/REST-Service/app/src/utils/Helper.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | from fastapi import UploadFile
  3 | import pandas as pd
  4 | 
  5 | class Helper:
  6 | 
  7 |     def __init__(self) -> None:
  8 |         pass
  9 | 
 10 |     def read_data(self, file_name: str, file_content: bytes) -> pd.DataFrame: 
 11 | 
 12 |         file_extension = file_name.split(".")[-1].lower()
 13 |         if file_extension not in ['xls', 'xlsx', 'csv']:
 14 |             raise Exception("Bad file types, accepted file types are xls, xlsx, and csv") 
 15 | 
 16 |         ## Read the data for btach processing 
 17 |         data_df = pd.DataFrame()
 18 |         file_stream = io.BytesIO(file_content)
 19 |         if '.xlsx' in file_name:
 20 |             data_df = pd.read_excel(file_stream)
 21 |         elif '.csv' in file_name:
 22 |             data_df =pd.read_csv(file_stream)
 23 |         return data_df
 24 |     
 25 | 
 26 |     def validate_single_turn_fields(self, data_df: pd.DataFrame):
 27 |         
 28 |         # Normalize the column names to lowercase
 29 |         data_df.columns = map(str.lower, data_df.columns)
 30 |         
 31 |         required_columns = ["previous_question", "previous_answer", "current_question", "golden_rewritten_question", "rewritten_question"]
 32 | 
 33 |         if all(column in data_df.columns for column in required_columns):
 34 |             return True
 35 |         
 36 |         columns = ", ".join(required_columns)
 37 | 
 38 |         raise Exception("Required columns are missing, valid columns are ## " + columns) 
 39 |     
 40 |     def validate_multi_turn_with_conversation_fields(self, data_df: pd.DataFrame):
 41 |         
 42 |         # Normalize the column names to lowercase
 43 |         data_df.columns = map(str.lower, data_df.columns)
 44 |         
 45 |         required_columns = ["conversation_history", "follow_up_query", "golden_query", "rewritten_query"]
 46 | 
 47 |         if all(column in data_df.columns for column in required_columns):
 48 |             return True
 49 |         
 50 |         columns = ", ".join(required_columns)
 51 | 
 52 |         raise Exception("Required columns are missing, valid columns are ## " + columns) 
 53 |     
 54 |     def validate_rating_and_similarity_fields(self, data_df: pd.DataFrame):
 55 |         # Normalize the column names to lowercase
 56 |         data_df.columns = map(str.lower, data_df.columns)
 57 | 
 58 |         # Define required columns in lowercase
 59 |         required_columns = ["question", "golden_text", "generated_text"]
 60 | 
 61 |         # Check if all required columns are present (case-insensitive)
 62 |         if all(column in data_df.columns for column in required_columns):
 63 |             return True
 64 | 
 65 |         columns = ", ".join(required_columns)
 66 | 
 67 |         raise Exception("Required columns are missing, valid columns are ## " + columns)
 68 | 
 69 | 
 70 |     def is_valid_file(file: UploadFile):
 71 |         filename = file.filename
 72 |         file_extension = filename.split(".")[-1].lower()
 73 |         
 74 |         if file_extension == 'csv' or file_extension in ['xls', 'xlsx']:
 75 |             return True
 76 |         else:
 77 |             return False
 78 | 
 79 | # This code was added to handle the case when the columns produced by langfuse script had lower case o in the
 80 | # Chrono Agent output field; This is not needed because we'll change all columns to title formant before sending 
 81 | # to whitebox eval       
 82 |     def validate_wbox_eval_fields(self, data_df: pd.DataFrame):
 83 | 
 84 |         ## Data provided has it as "Chrono Agent output" instead of "Chrono Agent Output" so made that change here..
 85 |         
 86 |         required_columns = ["Chrono Agent output", "Product Agent Output", "Research Agent Output", "Comms Agent Output"]
 87 | 
 88 |         if all(column in data_df.columns for column in required_columns):
 89 |             return True
 90 |         
 91 |         columns = ", ".join(required_columns)
 92 |  
 93 |         raise Exception("Required columns are missing, valid columns are ## " + columns)
 94 | 
 95 | 
 96 | 
 97 | #    def validate_wbox_eval_fields(self, data_df: pd.DataFrame):
 98 |         
 99 | #        required_columns = ["Chrono Agent Output", "Product Agent Output", "Research Agent Output", "Comms Agent Output"]
100 | 
101 | #        if all(column in data_df.columns for column in required_columns):
102 | #            return True
103 |         
104 | #        columns = ", ".join(required_columns)
105 | 
106 | #        raise Exception("Required columns are missing, valid columns are ## " + columns)
107 | 
108 |     def validate_bbox_eval_fields(self, data_df: pd.DataFrame):
109 |         
110 |         required_columns = ["Chrono Agent Output", "Product Agent Output", "Research Agent Output", "Comms Agent Output"]
111 | 
112 |         for col in required_columns:
113 |             colfound = col in data_df.columns
114 |             print(f"col {col} found: {colfound}")
115 | 
116 |         if all(column in data_df.columns for column in required_columns):
117 |             return True
118 |         
119 |         columns = ", ".join(required_columns)
120 | 
121 |         raise Exception("Required columns are missing, valid columns are ## " + columns)
122 | 
123 | 
124 |     def validate_neg_test_eval_fields(self, data_df: pd.DataFrame):
125 |         
126 |         required_columns = ["Research Agent Output", "Comms Agent Output"]
127 | 
128 | 
129 |         if all(column in data_df.columns for column in required_columns):
130 |             return True
131 |         
132 |         columns = ", ".join(required_columns)
133 | 
134 |         raise Exception("Required columns are missing, valid columns are ## " + columns) 
135 | 
136 | 
137 |     def validate_agent_eval_fields(self, data_df: pd.DataFrame):
138 |         
139 |         required_columns = ["Chrono Agent Output", "Product Agent Output", "Research Agent Output", "Comms Agent Output"]
140 | 
141 |         for col in required_columns:
142 |             colfound = col in data_df.columns
143 |             print(f"col {col} found: {colfound}")
144 | 
145 |         if all(column in data_df.columns for column in required_columns):
146 |             return True
147 |         
148 |         columns = ", ".join(required_columns)
149 | 
150 |         raise Exception("Required columns are missing, valid columns are ## " + columns)
151 | 


--------------------------------------------------------------------------------
/Framework/answer_similarity.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import configparser
  3 | from langchain_ibm import WatsonxLLM
  4 | from langchain_core.prompts import PromptTemplate
  5 | import sys
  6 | 
  7 | config = configparser.ConfigParser()
  8 | config.read('./config.ini')
  9 | 
 10 | ## Grading a generated text compared to a golden text
 11 | SIMILARITY_PROMPT= """Follow these structured steps to accurately assess the similarity between a Golden Text and a Generated Text:
 12 | 1. **Role and Task**: Assume the role of an impartial assistant and evaluator. Your task is to assess the similarity between a Golden Text and a Generated Text using the provided information.
 13 | 2. **Initial Setup**: Begin by carefully reviewing the Golden Text to understand the key information, entities, and intents it contains. The Golden Text is considered fully correct and comprehensive. Then, examine the Generated Text that needs evaluation.
 14 | 3. **Evaluation Criteria**: Evaluate the Generated Text based on the following criteria:
 15 |     - Output {{"Grade": "1"}} if:
 16 |       a) The Generated Text matches the Golden Text closely in terms of key entities and intents. Note that these may be worded differently but convey the same meaning contextually.
 17 |       b) The Generated Text contains all the essential information from the Golden Text, even if presented in a different order or with slight variations in phrasing.
 18 |       c) The Generated Text includes the core information from the Golden Text or  may contain additional relevant, concise details or expansions that don't contradict the contextual meaning of the Golden Text.
 19 |     - Output {{"Grade": "0"}} if:
 20 |       a) The Generated Text is missing critical entities or intents that are present in the Golden Text.
 21 |       b) The Generated Text contains significant factual errors or contradictions when compared to the Golden Text.
 22 |       c) The overall meaning or intent of the Generated Text substantially differs from the Golden Text.
 23 | 4. **Tolerance for Minor Differences**: Allow for minor differences in numerical values, slight variations in proper nouns, and small discrepancies in less critical details, as long as the core meaning and primary facts remain intact.
 24 | 5. **Explanation**: After providing the grade, explain your reasoning in 1 sentence, highlighting key similarities or differences that influenced your decision.
 25 | 6. **Output Format**: Format your evaluation output strictly as {{"Grade": "evaluated grade", "Explanation": "explanation for grade"}} to ensure clarity and consistency in assessment.
 26 | Remember, the goal is to identify substantive similarity rather than expecting word-for-word matches. Focus on the core information, key facts, and overall intent when making your assessment.
 27 | 
 28 | Input:
 29 | Golden Text: {prompt_parameter_1}
 30 | Generated Text: {prompt_parameter_2}
 31 | 
 32 | Output:
 33 | """
 34 | 
 35 | def batch_llm_answer_similarity(model_id, input_data):
 36 |     # watsonx.ai credentials for llm judge
 37 | 
 38 |     # instantiate wml connection
 39 |     wml_credentials = {
 40 |         "url": config['WML_CRED']['wml_url'],
 41 |         "apikey": config['WML_CRED']['api_key']
 42 |     }
 43 | 
 44 |     project_id = config['WML_CRED']['project_id']
 45 | 
 46 |     llm_model_id = model_id
 47 | 
 48 |     # llm parameters
 49 |     generate_parameters_1 = {
 50 |         "decoding_method": "greedy",
 51 |         "min_new_tokens": 1,
 52 |         "max_new_tokens": 200,
 53 |         "repetition_penalty": 1,
 54 |         "stop_sequences": ['}']
 55 |     }
 56 | 
 57 |     platform = config['WML_CRED']['wml_platform']
 58 |     if platform == "saas":
 59 |         # instatiate llm
 60 |         llm_model = WatsonxLLM(apikey=wml_credentials['apikey'],
 61 |                                 url=wml_credentials['url'],
 62 |                                 project_id=project_id,
 63 |                                 model_id=llm_model_id,
 64 |                                 params=generate_parameters_1)
 65 |     elif platform == "onpremise":
 66 |         wml_user = config['WML_CRED']['wml_user']
 67 |         llm_model = WatsonxLLM(apikey=wml_credentials['apikey'],
 68 |                             url=wml_credentials['url'],
 69 |                             model_id=llm_model_id,
 70 |                             username=wml_user,
 71 |                             instance_id='openshift',
 72 |                             project_id=project_id,
 73 |                             version="5.0",
 74 |                             params=generate_parameters_1)
 75 |     else:
 76 |         raise Exception("Please set a correct value in config.ini [WML_CRED][wml_platform], correct values are `onpremise` or `saas` ")
 77 |         
 78 |     input_data['Grade'] = None
 79 |     input_data['Explanation'] = None
 80 | 
 81 |     for index, row in input_data.iterrows():
 82 |         input_variables = ['prompt_parameter_1', 'prompt_parameter_2']
 83 |         prompt = PromptTemplate(input_variables=input_variables, template=SIMILARITY_PROMPT)
 84 |         llm_chain = prompt | llm_model
 85 |         # create invoke parameter which is a dictionary of your prompt parameters
 86 |         try:
 87 |             prompt_data = {'prompt_parameter_1': row['golden_text'],
 88 |                         'prompt_parameter_2': row['generated_text']}
 89 |         except KeyError as e:
 90 |             print(f"Error: Missing required column - {e}")
 91 |             print("Input file requires the following columns:")
 92 |             print("1) golden_text")
 93 |             print("2) generated_text")
 94 |         try:
 95 |             prompt_results = json.loads(llm_chain.invoke(prompt_data))
 96 |         except:
 97 |             prompt_results = 'Error generating results'
 98 |         
 99 |         if prompt_results == 'Error generating results':
100 |                 input_data.at[index,'Grade'] = 'Error'
101 |                 input_data.at[index,'Explanation'] = 'Error'
102 |         else:
103 |             input_data.at[index,'Grade'] = int(prompt_results['Grade'])
104 |             input_data.at[index,'Explanation'] = prompt_results['Explanation']
105 |         input_string = f"Golden Text: {prompt_data['prompt_parameter_1']}\n\nGenerated Text: {prompt_data['prompt_parameter_2']}"
106 |         print(f'-------------testing input {index + 1}-------------\n')
107 |         print(f'1) Input:\n\n{input_string}\n\n')
108 |         print(f'2) Output:\n\n{prompt_results}\n\n')
109 | 
110 |     return input_data
111 | 


--------------------------------------------------------------------------------