├── BE
├── .gitignore
├── .env
├── requirements.txt
├── app.py
└── chatbot
│ └── routes.py
├── FE
├── src
│ ├── constants
│ │ ├── index.ts
│ │ └── constants.ts
│ ├── vite-env.d.ts
│ ├── pages
│ │ ├── Landing.tsx
│ │ ├── Dashboard.tsx
│ │ └── Pages.tsx
│ ├── main.tsx
│ ├── index.css
│ ├── App.tsx
│ ├── routes.ts
│ ├── layouts
│ │ ├── Chatbot
│ │ │ ├── server.ts
│ │ │ ├── components
│ │ │ │ ├── Message.tsx
│ │ │ │ └── Input.tsx
│ │ │ ├── styles.ts
│ │ │ └── Chatbot.tsx
│ │ ├── Sidebar
│ │ │ ├── styles.ts
│ │ │ ├── components
│ │ │ │ └── History.tsx
│ │ │ └── Sidebar.tsx
│ │ ├── LandingBody
│ │ │ ├── styles.ts
│ │ │ └── LandingBody.tsx
│ │ ├── Footer
│ │ │ └── Footer.tsx
│ │ └── Header
│ │ │ ├── styles.ts
│ │ │ └── Header.tsx
│ ├── contexts
│ │ ├── usePersonalization.tsx
│ │ └── useIsDarkTheme.tsx
│ ├── theme
│ │ └── index.ts
│ └── data
│ │ └── data.ts
├── .env
├── public
│ ├── Jenkins_logo.ico
│ ├── 3-dots-fade.svg
│ └── Jenkins_logo.svg
├── vite.config.ts
├── tsconfig.node.json
├── .gitignore
├── index.html
├── .eslintrc.cjs
├── tsconfig.json
├── package.json
└── README.md
├── .gitattributes
├── images
├── llama.gif
├── llama2.png
└── llama2.webp
├── JenAi Final Document.docx
├── JenAi Final Document.pdf
├── datasets
├── raw
│ ├── Jenkins Docs.xlsx
│ └── QueryResults.csv
├── Jenkins Docs QA.csv
├── QueryResultsUpdated.csv
└── Community Questions Refined.csv
├── src
├── data preprocessing
│ ├── utils.py
│ ├── preprocessing.py
│ └── preprocessing.ipynb
└── data collection
│ ├── utils.py
│ ├── qa-article-to-qa-csv-pairs.ipynb
│ ├── parse-jenkins-community.ipynb
│ └── refine-html-tags.ipynb
├── LICENSE
├── .gitignore
└── README.md
/BE/.gitignore:
--------------------------------------------------------------------------------
1 | Llama-2-7b-chat-finetune/
2 |
--------------------------------------------------------------------------------
/FE/src/constants/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./constants";
2 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.csv filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/FE/src/vite-env.d.ts:
--------------------------------------------------------------------------------
1 | ///
11 | for tag in soup.find_all(True):
12 | if tag.name != 'code':
13 | tag.unwrap() # Remove the tag, keep its contents
14 |
15 | return str(soup)
16 |
17 |
--------------------------------------------------------------------------------
/FE/public/3-dots-fade.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/FE/src/index.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
5 | sans-serif;
6 | -webkit-font-smoothing: antialiased;
7 | -moz-osx-font-smoothing: grayscale;
8 | height: 100vh;
9 | }
10 |
11 | body > div {
12 | height: 100%;
13 | }
14 |
15 | code {
16 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
17 | monospace;
18 | }
19 |
20 |
--------------------------------------------------------------------------------
/FE/.eslintrc.cjs:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | root: true,
3 | env: { browser: true, es2020: true },
4 | extends: [
5 | 'eslint:recommended',
6 | 'plugin:@typescript-eslint/recommended',
7 | 'plugin:react-hooks/recommended',
8 | ],
9 | ignorePatterns: ['dist', '.eslintrc.cjs'],
10 | parser: '@typescript-eslint/parser',
11 | plugins: ['react-refresh'],
12 | rules: {
13 | 'react-refresh/only-export-components': [
14 | 'warn',
15 | { allowConstantExport: true },
16 | ],
17 | },
18 | }
19 |
--------------------------------------------------------------------------------
/FE/src/App.tsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import Pages from "./pages/Pages";
3 | import { IsDarkThemeProvider } from "./contexts/useIsDarkTheme";
4 | import { PersonalizationProvider } from "./contexts/usePersonalization";
5 |
6 | function App() {
7 | return (
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 | );
16 | }
17 |
18 | export default App;
19 |
--------------------------------------------------------------------------------
/FE/src/routes.ts:
--------------------------------------------------------------------------------
1 | import Dashboard from "./pages/Dashboard";
2 | import Landing from "./pages/Landing";
3 |
/** Shape of one entry in the application's route table. */
export type RouteSchema = {
  /** Human-readable name of the page. */
  name: string;
  /** Short identifier for the entry. */
  key: string;
  /** URL path pattern for the page (may contain parameters such as `:chatId?`). */
  route: string;
  /** Component associated with the path; optional. */
  component?: React.FunctionComponent;
};
10 |
/**
 * Route table of the application: the chat dashboard (with an optional
 * `chatId` path parameter) and the landing page.
 */
export const routes: RouteSchema[] = [
  {
    name: "Dashboard",
    key: "dashboard",
    route: "/dashboard/:chatId?",
    component: Dashboard,
  },
  {
    // Display name was misspelled as "Landging".
    name: "Landing",
    key: "landing",
    route: "/landing",
    component: Landing,
  },
];
25 |
--------------------------------------------------------------------------------
/FE/src/layouts/Chatbot/server.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 |
3 | interface ApiResponse {
4 | prediction: string;
5 | }
6 |
7 | const chatEndpoint =
8 | import.meta.env.VITE_SERVER_URL + import.meta.env.VITE_CHAT_ENDPOINT;
9 |
/**
 * Posts a user query to the chat endpoint and resolves with the model's reply.
 *
 * @param query - Text typed by the user; sent as `text`.
 * @param personalization - Persona text; sent as `persona`.
 * @returns The `prediction` field of the server's JSON response.
 * @throws Re-throws the request error after logging it, so callers can react.
 */
export const sendQuery = (
  query: string,
  personalization: string
): Promise<string> => {
  return axios
    .post<ApiResponse>(chatEndpoint, { text: query, persona: personalization })
    .then((response) => response.data.prediction)
    .catch((error: unknown) => {
      console.error("Error sending query:", error);
      throw error;
    });
};
19 |
--------------------------------------------------------------------------------
/BE/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 | from chatbot.routes import chatbot_bp
3 | from flask_cors import CORS
4 | import os
5 | from dotenv import load_dotenv
6 |
# Load environment variables from a .env file
load_dotenv('.env')

app = Flask(__name__)

# Load configuration
# NOTE(review): `env` is read here but not used anywhere in this file.
env = os.environ.get('FLASK_ENV', 'development')

# Configure CORS
CORS(app)

# Register the blueprint with configurable URL prefix
app.register_blueprint(chatbot_bp, url_prefix=os.getenv('CHATBOT_URL_PREFIX'))

if __name__ == '__main__':
    # Fall back to Flask's usual defaults when the variables are unset or
    # empty; int(None) would otherwise raise TypeError before the server starts.
    host = os.getenv('FLASK_RUN_HOST') or '127.0.0.1'
    port = int(os.getenv('FLASK_RUN_PORT') or '5000')
    app.run(host=host, port=port)
23 |
--------------------------------------------------------------------------------
/FE/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2020",
4 | "useDefineForClassFields": true,
5 | "lib": ["ES2020", "DOM", "DOM.Iterable"],
6 | "module": "ESNext",
7 | "skipLibCheck": true,
8 |
9 | /* Bundler mode */
10 | "moduleResolution": "bundler",
11 | "allowImportingTsExtensions": true,
12 | "resolveJsonModule": true,
13 | "isolatedModules": true,
14 | "noEmit": true,
15 | "jsx": "react-jsx",
16 |
17 | /* Linting */
18 | "strict": true,
19 | "noUnusedLocals": true,
20 | "noUnusedParameters": true,
21 | "noFallthroughCasesInSwitch": true
22 | },
23 | "include": ["src"],
24 | "references": [{ "path": "./tsconfig.node.json" }]
25 | }
26 |
--------------------------------------------------------------------------------
/FE/src/layouts/Sidebar/styles.ts:
--------------------------------------------------------------------------------
1 | import { ListItem } from "@mui/material";
2 | import { styled } from "@mui/material/styles";
3 |
/** Extra props accepted by `CustomListItem` on top of MUI's `ListItem` props. */
interface ListItemProps {
  disablePadding?: boolean;
  /** Compared against the string "true" to pick the background. */
  contained?: string;
}

/**
 * `ListItem` with a dark grey background when `contained === "true"` and a
 * transparent background otherwise.
 *
 * The `<ListItemProps>` type argument is required so that `contained` is a
 * known prop inside the style callback.
 */
export const CustomListItem = styled(ListItem)<ListItemProps>(
  ({ theme, contained }) => ({
    backgroundColor:
      contained === "true" ? theme.palette.grey[900] : "transparent",
  })
);
15 |
/**
 * Flex row used as the drawer's header; spreads the theme's toolbar mixin and
 * pushes its children to opposite ends.
 */
export const DrawerHeader = styled("div")(({ theme }) => {
  const toolbarMixin = theme.mixins.toolbar;

  return {
    display: "flex",
    alignItems: "center",
    padding: theme.spacing(0, 1),
    // necessary for content to be below app bar
    ...toolbarMixin,
    // declared after the mixin so it is not overridden by it
    justifyContent: "space-between",
  };
});
24 |
--------------------------------------------------------------------------------
/FE/src/pages/Dashboard.tsx:
--------------------------------------------------------------------------------
1 | import Sidebar from "../layouts/Sidebar/Sidebar";
2 | import Chatbot from "../layouts/Chatbot/Chatbot";
3 |
4 | import { Box } from "@mui/material";
5 | import { useState } from "react";
6 | import Header from "../layouts/Header/Header";
7 |
8 | const Dashboard = () => {
9 | const [open, setOpen] = useState(true);
10 |
11 | const handleDrawerOpen = () => {
12 | setOpen(true);
13 | };
14 |
15 | const handleDrawerClose = () => {
16 | setOpen(false);
17 | };
18 |
19 | return (
20 |
21 |
22 |
23 |
24 |
25 | );
26 | };
27 |
28 | export default Dashboard;
29 |
--------------------------------------------------------------------------------
/FE/src/layouts/LandingBody/styles.ts:
--------------------------------------------------------------------------------
1 | import { Box, Typography as MUITypography } from "@mui/material";
2 | import { styled } from "@mui/material/styles";
3 |
4 | type InputProps = {
5 | color?: string;
6 | };
7 |
/** Centered flex row with a 1rem gap between its children. */
export const ButtonsWrapper = styled(Box)({
  display: "flex",
  justifyContent: "center",
  alignItems: "center",
  gap: "1rem",
});
14 |
/**
 * Full-viewport-height column, 60% wide and centered, with centered text in
 * the theme's primary text color.
 */
export const Container = styled(Box)(({ theme: { palette } }) => ({
  // sizing
  height: "100vh",
  width: "60%",
  margin: "auto",
  // layout
  display: "flex",
  flexDirection: "column",
  alignItems: "center",
  justifyContent: "center",
  gap: "2rem",
  // text
  textAlign: "center",
  color: palette.text.primary,
}));
27 |
/**
 * Inline MUI `Typography` whose text color is taken from the `color` prop.
 *
 * The `<InputProps>` type argument ties the otherwise unused `InputProps`
 * declaration to this component.
 */
export const Typography = styled(MUITypography)<InputProps>(({ color }) => ({
  color,
  display: "inline",
}));
32 |
--------------------------------------------------------------------------------
/FE/src/contexts/usePersonalization.tsx:
--------------------------------------------------------------------------------
1 | import { createContext, useContext, useState } from "react";
2 |
3 | type ProviderProps = {
4 | children: React.ReactNode;
5 | };
6 |
7 | type ContextValue = {
8 | text: string;
9 | handleUpdate: (text: string) => void;
10 | };
11 |
12 | const PersonalizationContext = createContext({
13 | text: "",
14 | handleUpdate: () => {},
15 | });
16 |
17 | export function PersonalizationProvider({ children }: ProviderProps) {
18 | const [text, setText] = useState("");
19 |
20 | const handleUpdate = (text: string) => {
21 | setText(text);
22 | };
23 |
24 | return (
25 |
26 | {children}
27 |
28 | );
29 | }
30 |
31 | export function usePersonalization() {
32 | return useContext(PersonalizationContext);
33 | }
34 |
--------------------------------------------------------------------------------
/FE/src/contexts/useIsDarkTheme.tsx:
--------------------------------------------------------------------------------
1 | import { createContext, useContext, useState } from "react";
2 |
3 | type ProviderProps = {
4 | children: React.ReactNode;
5 | };
6 |
7 | type ContextValue = {
8 | isDarkTheme: boolean;
9 | handleThemeChange: () => void;
10 | };
11 |
12 | const IsDarkThemeContext = createContext({
13 | isDarkTheme: false,
14 | handleThemeChange: () => {},
15 | });
16 |
17 | export function IsDarkThemeProvider({ children }: ProviderProps) {
18 | const [isDarkTheme, setIsDarkTheme] = useState(true);
19 |
20 | const handleThemeChange = () => {
21 | setIsDarkTheme(!isDarkTheme);
22 | };
23 |
24 | return (
25 |
26 | {children}
27 |
28 | );
29 | }
30 |
31 | export function useIsDarkTheme() {
32 | return useContext(IsDarkThemeContext);
33 | }
34 |
--------------------------------------------------------------------------------
/FE/src/layouts/Footer/Footer.tsx:
--------------------------------------------------------------------------------
1 | import { Box, Container, Grid, Typography, useTheme } from "@mui/material";
2 |
3 | const Footer = () => {
4 | const theme = useTheme();
5 |
6 | return (
7 |
8 |
9 |
10 |
11 |
16 | Jenkins Chatbot
17 |
18 |
19 |
20 |
21 | {`GSoC'24 | Jenkins | Enhancing LLM Project`}
22 |
23 |
24 |
25 |
26 |
27 | );
28 | };
29 |
30 | export default Footer;
31 |
--------------------------------------------------------------------------------
/FE/src/layouts/Header/styles.ts:
--------------------------------------------------------------------------------
1 | import { styled } from "@mui/material/styles";
2 | import MuiAppBar, { AppBarProps as MuiAppBarProps } from "@mui/material/AppBar";
3 |
4 | const drawerWidth = 240;
5 |
/** MUI `AppBar` props plus the drawer's open state. */
interface AppBarProps extends MuiAppBarProps {
  /** When true, the bar is narrowed and shifted right by `drawerWidth`. */
  open?: boolean;
}

/**
 * Application bar that animates its width and left margin as `open` changes.
 *
 * `open` is consumed by the style callback only; `shouldForwardProp` keeps it
 * from being forwarded to the underlying component. The `<AppBarProps>` type
 * argument is what makes `open` available in the callback.
 */
export const AppBar = styled(MuiAppBar, {
  shouldForwardProp: (prop) => prop !== "open",
})<AppBarProps>(({ theme, open }) => ({
  backgroundColor: theme.palette.secondary.main,
  paddingRight: 30,
  paddingLeft: 30,
  height: 80,
  justifyContent: "center",
  transition: theme.transitions.create(["margin", "width"], {
    easing: theme.transitions.easing.sharp,
    duration: theme.transitions.duration.leavingScreen,
  }),
  // Applied last so the "open" values override the defaults above.
  ...(open && {
    width: `calc(100% - ${drawerWidth}px)`,
    marginLeft: `${drawerWidth}px`,
    transition: theme.transitions.create(["margin", "width"], {
      easing: theme.transitions.easing.easeOut,
      duration: theme.transitions.duration.enteringScreen,
    }),
  }),
}));
31 |
--------------------------------------------------------------------------------
/FE/src/theme/index.ts:
--------------------------------------------------------------------------------
1 | import { createTheme, responsiveFontSizes } from "@mui/material/styles";
2 |
// Light theme: built first, then wrapped with responsive font sizes.
const lightBaseTheme = createTheme({
  typography: {},
  palette: {
    mode: "light",
    primary: {
      main: "#bdbdbd",
      light: "#ecedf2",
      dark: "#676767",
    },
    secondary: {
      main: "#003b6d",
      light: "#669acc",
    },
    text: {
      primary: "#000000",
      secondary: "#ffffff",
    },
  },
});

const lightTheme = responsiveFontSizes(lightBaseTheme);
25 |
// Dark theme: built first, then wrapped with responsive font sizes.
const darkBaseTheme = createTheme({
  typography: {},
  palette: {
    mode: "dark",
    primary: {
      main: "#f50057",
      dark: "#f50057",
    },
    secondary: {
      main: "#000000",
    },
    text: {
      primary: "#ffffff",
      secondary: "#000000",
    },
  },
});

const darkTheme = responsiveFontSizes(darkBaseTheme);
46 |
47 | export { lightTheme, darkTheme };
48 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Nour Ziad Almulhem
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/FE/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "fe",
3 | "private": true,
4 | "version": "0.0.0",
5 | "type": "module",
6 | "scripts": {
7 | "dev": "vite",
8 | "build": "tsc && vite build",
9 | "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0",
10 | "preview": "vite preview"
11 | },
12 | "dependencies": {
13 | "@emotion/react": "^11.11.4",
14 | "@emotion/styled": "^11.11.5",
15 | "@mui/icons-material": "^5.15.20",
16 | "@mui/material": "^5.15.20",
17 | "axios": "^1.12.0",
18 | "install": "^0.13.0",
19 | "npm": "^10.8.1",
20 | "react": "^18.2.0",
21 | "react-dom": "^18.2.0",
22 | "react-router-dom": "^6.23.1",
23 | "styled-components": "^6.1.11"
24 | },
25 | "devDependencies": {
26 | "@types/node": "^20.14.11",
27 | "@types/react": "^18.2.66",
28 | "@types/react-dom": "^18.2.22",
29 | "@typescript-eslint/eslint-plugin": "^7.2.0",
30 | "@typescript-eslint/parser": "^7.2.0",
31 | "@vitejs/plugin-react": "^4.2.1",
32 | "eslint": "^8.57.0",
33 | "eslint-plugin-react-hooks": "^4.6.0",
34 | "eslint-plugin-react-refresh": "^0.4.6",
35 | "typescript": "^5.2.2",
36 | "vite": "^5.2.0"
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/FE/README.md:
--------------------------------------------------------------------------------
1 | # React + TypeScript + Vite
2 |
3 | This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
4 |
5 | Currently, two official plugins are available:
6 |
7 | - [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react/README.md) uses [Babel](https://babeljs.io/) for Fast Refresh
8 | - [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
9 |
10 | ## Expanding the ESLint configuration
11 |
12 | If you are developing a production application, we recommend updating the configuration to enable type aware lint rules:
13 |
14 | - Configure the top-level `parserOptions` property like this:
15 |
16 | ```js
17 | export default {
18 | // other rules...
19 | parserOptions: {
20 | ecmaVersion: 'latest',
21 | sourceType: 'module',
22 | project: ['./tsconfig.json', './tsconfig.node.json'],
23 | tsconfigRootDir: __dirname,
24 | },
25 | }
26 | ```
27 |
28 | - Replace `plugin:@typescript-eslint/recommended` with `plugin:@typescript-eslint/recommended-type-checked` or `plugin:@typescript-eslint/strict-type-checked`
29 | - Optionally add `plugin:@typescript-eslint/stylistic-type-checked`
30 | - Install [eslint-plugin-react](https://github.com/jsx-eslint/eslint-plugin-react) and add `plugin:react/recommended` & `plugin:react/jsx-runtime` to the `extends` list
31 |
--------------------------------------------------------------------------------
/FE/src/layouts/Chatbot/components/Message.tsx:
--------------------------------------------------------------------------------
1 | import { Box, Typography } from "@mui/material";
2 | import { MessageEntity } from "../Chatbot";
3 | import { AssistantAvatar, MsgWrapper, UserAvatar } from "../styles";
4 | import PersonIcon from "@mui/icons-material/Person";
5 |
6 | type MessageProps = {
7 | msg: MessageEntity;
8 | };
9 |
10 | const Message = (props: MessageProps) => {
11 | const { msg } = props;
12 | const isUser = msg.role === "User";
13 |
14 | const format_message = (message: string) => {
15 | console.log(message);
16 | const urlRegex = /]+>/g;
17 | return message
18 | .replace(urlRegex, (url) => {
19 | const cleanUrl = url.slice(1, -1);
20 | return `${cleanUrl}`;
21 | })
22 | .replace(/\n/g, "
");
23 | };
24 |
25 | return (
26 |
32 | {isUser && (
33 |
34 |
35 |
36 | )}
37 |
38 |
39 | {typeof msg.message != "string" ? (
40 | {msg.message}
41 | ) : (
42 |
45 | )}
46 |
47 |
48 | {!isUser && (
49 |
54 | )}
55 |
56 | );
57 | };
58 |
59 | export default Message;
60 |
--------------------------------------------------------------------------------
/FE/src/layouts/Sidebar/components/History.tsx:
--------------------------------------------------------------------------------
1 | import {
2 | ListItemText,
3 | ListItemIcon,
4 | ListItemButton,
5 | Divider,
6 | } from "@mui/material";
7 | import { useMemo } from "react";
8 | import List from "@mui/material/List";
9 | import ChatIcon from "@mui/icons-material/Chat";
10 | import { useNavigate, useParams } from "react-router-dom";
11 | import { CustomListItem } from "../styles";
12 | import { chatHistory } from "../../../data/data";
13 |
14 | const History = () => {
15 | const navigate = useNavigate();
16 | const { chatId } = useParams();
17 | const chatNumber = parseInt(chatId || "0");
18 |
19 | const historyElements = useMemo(
20 | () =>
21 | chatHistory.map((chat) => ({
22 | id: chat.id,
23 | text: chat.history[0].message.slice(0, 25) + "..",
24 | })),
25 | []
26 | );
27 |
28 | return (
29 | <>
30 | {historyElements && (
31 | <>
32 |
33 |
34 | {historyElements.map((item) => (
35 |
40 | navigate(`/dashboard/${item.id}`)}
42 | >
43 |
44 |
45 |
46 |
47 |
48 |
49 | ))}
50 |
51 | >
52 | )}
53 | >
54 | );
55 | };
56 |
57 | export default History;
58 |
--------------------------------------------------------------------------------
/FE/src/layouts/LandingBody/LandingBody.tsx:
--------------------------------------------------------------------------------
1 | import { Link } from "react-router-dom";
2 | import { ButtonsWrapper, Container, Typography } from "./styles";
3 |
4 | import { Button } from "@mui/material";
5 | import { useIsDarkTheme } from "../../contexts/useIsDarkTheme";
6 |
7 | const LandingBody = () => {
8 | const { isDarkTheme } = useIsDarkTheme();
9 | return (
10 |
11 |
12 | Welcome! to{" "}
13 |
18 | JenAi{" "}
19 |
20 |
21 |
22 | JenAi is your assistant today! JenAi was a GSoC'24 project by Jenkins.
23 | The project was to build a chatbot for specific Jenkins knowledge that
24 | can help users with their queries.
25 |
26 |
27 |
28 |
38 |
48 |
49 |
50 | );
51 | };
52 |
53 | export default LandingBody;
54 |
--------------------------------------------------------------------------------
/FE/src/layouts/Chatbot/components/Input.tsx:
--------------------------------------------------------------------------------
1 | import InputBase from "@mui/material/InputBase";
2 | import Divider from "@mui/material/Divider";
3 | import IconButton from "@mui/material/IconButton";
4 | import AddCircleOutlineIcon from "@mui/icons-material/AddCircleOutline";
5 | import SendIcon from "@mui/icons-material/Send";
6 | import { InputWrapper } from "../styles";
7 | import { useIsDarkTheme } from "../../../contexts/useIsDarkTheme";
8 |
9 | type InputProps = {
10 | setQuery: (query: string) => void;
11 | query: string;
12 | disabled: boolean;
13 | onSubmit: (e: React.FormEvent) => void;
14 | };
15 |
16 | const Input = (props: InputProps) => {
17 | const { setQuery, query, disabled, onSubmit } = props;
18 | const { isDarkTheme } = useIsDarkTheme();
19 |
20 | const handleKeyDown = (e: React.KeyboardEvent) => {
21 | if (e.key === "Enter") {
22 | e.preventDefault();
23 | onSubmit(e);
24 | }
25 | };
26 |
27 | return (
28 |
29 | setQuery(e.target.value)}
33 | onKeyDown={handleKeyDown}
34 | value={query}
35 | multiline
36 | fullWidth
37 | disabled={disabled}
38 | />
39 |
40 |
41 |
42 |
43 |
49 |
50 |
51 |
52 | );
53 | };
54 |
55 | export default Input;
56 |
--------------------------------------------------------------------------------
/FE/src/pages/Pages.tsx:
--------------------------------------------------------------------------------
1 | import { Routes, Route, Navigate } from "react-router-dom";
2 | import { CssBaseline, ThemeProvider } from "@mui/material";
3 | import { routes, RouteSchema } from "../routes";
4 | import { darkTheme, lightTheme } from "../theme";
5 | import { useIsDarkTheme } from "../contexts/useIsDarkTheme";
6 |
7 | import { createGlobalStyle } from "styled-components";
8 |
9 | const GlobalStyles = createGlobalStyle`
10 | /* width */
11 | ::-webkit-scrollbar {
12 | width: 8px;
13 | }
14 |
15 | /* Track */
16 | ::-webkit-scrollbar-track {
17 | box-shadow: inset 0 0 5px grey;
18 | border-radius: 10px;
19 | }
20 |
21 | /* Handle */
22 | ::-webkit-scrollbar-thumb {
23 | /* background: #f50057; */
24 | background: ${(props) => props.theme.palette.primary.main};
25 | border-radius: 10px;
26 | }
27 |
28 | /* Handle on hover */
29 | ::-webkit-scrollbar-thumb:hover {
30 | /* background: #f50057; */
31 | background: ${(props) => props.theme.palette.primary.main};
32 | }
33 | `;
34 |
35 | const Pages = () => {
36 | const { isDarkTheme } = useIsDarkTheme();
37 |
38 | const getRoutes = (allRoutes: RouteSchema[]) =>
39 | allRoutes.map((route: RouteSchema) => {
40 | if (route.route) {
41 | return (
42 |
47 | );
48 | }
49 |
50 | return null;
51 | });
52 |
53 | return (
54 |
55 |
56 |
57 |
58 | {getRoutes(routes)}
59 | } />
60 |
61 |
62 | );
63 | };
64 |
65 | export default Pages;
66 |
--------------------------------------------------------------------------------
/BE/chatbot/routes.py:
--------------------------------------------------------------------------------
1 | # chatbot/routes.py
2 | from flask import Blueprint, request, jsonify, Response, stream_with_context
3 | import logging
4 |
5 | from langchain_community.llms import CTransformers
6 | # from langchain.chains import LLMChain
7 | from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
8 | from langchain_core.prompts import PromptTemplate
9 |
10 | # Configure logging
11 | logging.basicConfig(filename='app.log', level=logging.ERROR,
12 | format='%(asctime)s %(levelname)s %(name)s %(message)s')
13 |
14 | chatbot_bp = Blueprint('chatbot', __name__)
15 |
16 | llm = CTransformers(model="nouralmulhem/Llama-2-7b-finetune-q8", model_file = 'model.bin', callbacks=[StreamingStdOutCallbackHandler()])
17 |
# Llama 2 chat prompt. The system section (instructions, persona and the
# running conversation) must be wrapped in <<SYS>> ... <</SYS>> inside the
# [INST] block; {persona}, {history} and {text} are filled in per request.
template = """
[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Your answers are always very brief.
{persona}
Current conversation:
{history}
<</SYS>>
Human: {text}
AI: [/INST]
"""
28 |
29 | prompt = PromptTemplate(template=template, input_variables=["text", "persona", "history"])
30 |
31 | # llm_chain = LLMChain(prompt=prompt, llm=llm)
32 | llm_chain = prompt | llm
33 |
34 |
# Running transcript of every exchange handled by this process; it is a
# module-level value, so it is shared by all requests.
history = ''


def process_request(query, persona):
    """Run one query through the LLM chain and record the exchange.

    Args:
        query: The user's message, passed to the chain as ``text``.
        persona: Persona text, passed to the chain as ``persona``.

    Returns:
        Whatever ``llm_chain.invoke`` returns for this query.

    Side effects:
        Appends ``Human: <query>`` / ``AI: <response>`` to the module-level
        ``history``, which is fed back into the next call.
    """
    global history
    chain_inputs = {"text": query, "persona": persona, "history": history}
    response = llm_chain.invoke(chain_inputs)
    history = history + f'Human: {query}\nAI: {response}\n'
    return response
43 |
@chatbot_bp.route('/chat', methods=['POST'])
def chat():
    """Answer a chat query posted as a JSON object.

    Reads ``text`` and ``persona`` from the body (each defaults to ``''``) and
    responds with ``{"prediction": <model response>}``.

    A body that is missing, malformed, or not a JSON object gets
    ``{"prediction": "error"}`` with status 400. A failure while generating
    the answer is logged and reported as ``{"prediction": "error"}`` with the
    default status, as before.
    """
    # silent=True returns None for a missing/invalid JSON body instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # e.g. `null` or a JSON array: there is nothing to call .get() on.
        return jsonify({"prediction": "error"}), 400

    query = data.get('text', '')
    persona = data.get('persona', '')
    try:
        prediction = process_request(query, persona)
        return jsonify({"prediction": prediction})
    except Exception:
        logging.error("Error occurred in /chat endpoint", exc_info=True)
        return jsonify({"prediction": "error"})
55 |
56 |
@chatbot_bp.route('/test', methods=['GET'])
def test():
    """Respond with a fixed ``{"test": "OK"}`` JSON payload."""
    payload = {'test': 'OK'}
    return jsonify(payload)
60 |
61 |
--------------------------------------------------------------------------------
/FE/src/layouts/Chatbot/styles.ts:
--------------------------------------------------------------------------------
1 | import { Avatar, Paper, styled } from "@mui/material";
2 | import { drawerWidth } from "../../constants";
3 | import { grey } from "@mui/material/colors";
4 |
/**
 * Paper surface for the chat area: a centered flex column that fills its
 * parent and uses the theme's secondary color as background.
 */
export const Container = styled(Paper)(({ theme: { palette } }) => ({
  // sizing
  height: "100%",
  width: "100%",
  minWidth: "15%",
  margin: 30,
  padding: 10,
  // colors
  backgroundColor: palette.secondary.main,
  color: palette.text.primary,
  // layout
  display: "flex",
  flexDirection: "column",
  justifyContent: "center",
  alignItems: "center",
  gap: 20,
}));
19 |
/**
 * Main content area placed next to the drawer.
 *
 * `open` is used only for styling; `shouldForwardProp` stops it from being
 * forwarded to the `<main>` element.
 */
export const Main = styled("main", {
  shouldForwardProp: (prop) => prop !== "open",
})<{
  open?: boolean;
}>(({ theme, open }) => ({
  flexGrow: 1,
  padding: theme.spacing(3),
  height: "85vh",
  width: "100%",
  transition: theme.transitions.create("margin", {
    easing: theme.transitions.easing.sharp,
    duration: theme.transitions.duration.leavingScreen,
  }),
  // Default: negative left margin of one drawer width.
  marginLeft: `-${drawerWidth}px`,
  // When open, the spread below overrides the transition and resets the
  // margin to 0; it must stay after the defaults above.
  ...(open && {
    transition: theme.transitions.create("margin", {
      easing: theme.transitions.easing.easeOut,
      duration: theme.transitions.duration.enteringScreen,
    }),
    marginLeft: 0,
  }),
}));
42 |
/**
 * Flex row that spreads the theme's toolbar mixin and aligns its children to
 * the end of the row.
 */
export const DrawerHeader = styled("div")(({ theme }) => {
  const toolbarMixin = theme.mixins.toolbar;

  return {
    display: "flex",
    alignItems: "center",
    padding: theme.spacing(0, 1),
    // necessary for content to be below app bar
    ...toolbarMixin,
    // declared after the mixin so it is not overridden by it
    justifyContent: "flex-end",
  };
});
51 |
/** Paper row, 80% wide, that vertically centers its children. */
export const InputWrapper = styled(Paper)({
  display: "flex",
  alignItems: "center",
  width: "80%",
  padding: "2px 4px",
});
58 |
/**
 * Paper used as a message bubble: at most 70% wide, grey[900] in dark mode
 * and grey[300] otherwise.
 */
export const MsgWrapper = styled(Paper)(({ theme }) => {
  const isDarkMode = theme.palette.mode === "dark";

  return {
    padding: 16,
    display: "flex",
    alignItems: "center",
    maxWidth: "70%",
    backgroundColor: isDarkMode ? grey[900] : grey[300],
  };
});
66 |
/**
 * Avatar whose background is the primary color in dark mode and the light
 * secondary color otherwise.
 */
export const UserAvatar = styled(Avatar)(({ theme: { palette } }) => ({
  backgroundColor:
    palette.mode !== "dark" ? palette.secondary.light : palette.primary.main,
}));
73 |
/** Avatar with a fixed 40x60 (width x height) size. */
export const AssistantAvatar = styled(Avatar)({
  width: 40,
  height: 60,
});
78 |
--------------------------------------------------------------------------------
/src/data preprocessing/preprocessing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from datasets import Dataset, concatenate_datasets
3 |
def preprocess_data(file_name, questions_col, answers_col, max_sequence_length=11000, num_samples=None):
    """
    Preprocess a question/answer CSV into single-column training text.

    Each kept row becomes ``"[INST] <question> [/INST] <answer> "`` with
    newlines replaced by spaces.

    Rows are dropped when:
      * the question or the answer is missing,
      * the question or the answer contains a ``<code>`` tag,
      * the formatted text is longer than ``max_sequence_length`` characters.

    Args:
        file_name (str): Path of the CSV file to be preprocessed
        questions_col (str): The name of the column containing the questions
        answers_col (str): The name of the column containing the answers
        max_sequence_length (int): The maximum length, in characters, of a formatted row
        num_samples (int): If set and the dataset has at least this many rows,
            only the first ``num_samples`` rows are kept
    Returns:
        dataset: The preprocessed dataset built from the ``text`` column
    """
    raw_data = pd.read_csv(file_name)

    # Rows with a missing question or answer cannot be formatted, and NaN
    # cells would also break the boolean masks below.
    raw_data = raw_data.dropna(subset=[questions_col, answers_col])

    # Drop rows where either column contains a code tag. The tag is matched
    # literally; an empty pattern would match (and therefore drop) every row.
    has_code_tag = (
        raw_data[answers_col].astype(str).str.contains('<code>', regex=False)
        | raw_data[questions_col].astype(str).str.contains('<code>', regex=False)
    )
    raw_data = raw_data[~has_code_tag]

    # Combine columns into the desired format, one (index, text) pair per row.
    formatted = [
        (idx, f"[INST] {str(question).strip()} [/INST] {answer} ".replace('\n', ' '))
        for idx, question, answer in zip(
            raw_data.index, raw_data[questions_col], raw_data[answers_col]
        )
    ]

    # Keep only rows whose text is at most max_sequence_length characters.
    kept = [(idx, text) for idx, text in formatted if len(text) <= max_sequence_length]

    # The original row labels are kept as the index, as before.
    df_filtered = pd.DataFrame(
        {'text': [text for _, text in kept]},
        index=[idx for idx, _ in kept],
    )

    # convert to dataset and select num_samples records (considering memory resource)
    dataset = Dataset.from_pandas(df_filtered)

    if num_samples and len(dataset) >= num_samples:
        dataset = dataset.select(range(num_samples))

    return dataset
52 |
if __name__ == "__main__":
    import os

    max_sequence_length = 11000
    num_samples = None

    # (csv path, questions column, answers column) for every source dataset.
    sources = [
        ('../../datasets/QueryResultsUpdated.csv', 'Question Body', 'Answer Body'),
        ('../../datasets/Jenkins Docs QA.csv', 'Question', 'Answer'),
        ('../../datasets/Community Questions Refined.csv', 'questions', 'answers'),
    ]

    parts = [
        preprocess_data(file_name, questions_col, answers_col, max_sequence_length, num_samples)
        for file_name, questions_col, answers_col in sources
    ]
    dataset = concatenate_datasets(parts)

    # Make sure the output directory exists before writing the merged CSV.
    output_file = '../../datasets/final/final2.csv'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    dataset.to_csv(output_file)
    print(len(dataset))
--------------------------------------------------------------------------------
/src/data collection/qa-article-to-qa-csv-pairs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import csv"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 21,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "0 Q: What is Jenkins?\\nA: Jenkins is a self-cont...\n",
23 | "1 Q: What is the purpose of the tutorial \"Build ...\n",
24 | "2 **Q: Who is this LabVIEW CI/CD tutorial intend...\n",
25 | "3 Q: What is the concept behind using Jenkinsfil...\n",
26 | "4 Q: What does this tutorial demonstrate?\\nA: Th...\n",
27 | " ... \n",
28 | "91 Q: How can you refer to another project/job by...\n",
29 | "92 Q: What are the three flavors of remote access...\n",
30 | "93 Q: What is a Jenkins Agent?\\nA: A Jenkins Agen...\n",
31 | "94 Q: What is the importance of automating job de...\n",
32 | "95 Q: What is the purpose of file fingerprinting ...\n",
33 | "Name: 1, Length: 96, dtype: object\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "data = pd.read_excel(\"Jenkins Docs.xlsx\", header=None)\n",
39 | "\n",
40 | "second_column_values = data.iloc[:, 1]\n",
41 | "\n",
42 | "print(second_column_values)\n"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 53,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "output_file = '../datasets/Jenkins Docs QA.csv'\n",
52 | "\n",
53 | "with open(output_file, \"w\", newline=\"\") as csvfile:\n",
54 | " fieldnames = [\"Question\", \"Answer\"]\n",
55 | " writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
56 | " writer.writeheader()\n",
57 | "\n",
58 | "for idx, text in enumerate(second_column_values):\n",
59 | " # Split the text into individual QA pairs\n",
60 | " # qas = text.strip().split(\"A:\")\n",
61 | "\n",
62 | " # Write the QA pairs to a CSV file\n",
63 | " with open(output_file, \"a\", newline=\"\") as csvfile:\n",
64 | " writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
65 | "\n",
66 | " dict = text.strip().split(\"Q:\")\n",
67 | " # print(len(dict[1:]), dict[1:])\n",
68 | " for item in dict[1:]:\n",
69 | " extracted = item.strip().split(\"A:\")\n",
70 | " if len(extracted) < 2:\n",
71 | " print(idx)\n",
72 | " print(item)\n",
73 | " print(extracted)\n",
74 | " else:\n",
75 | " question, answer = extracted \n",
76 | " \n",
77 | " if question.startswith(\"Q:\"):\n",
78 | " question = question[2:]\n",
79 | " writer.writerow({\"Question\": question.strip(), \"Answer\": answer.strip()})\n"
80 | ]
81 | }
82 | ],
83 | "metadata": {
84 | "kernelspec": {
85 | "display_name": "base",
86 | "language": "python",
87 | "name": "python3"
88 | },
89 | "language_info": {
90 | "codemirror_mode": {
91 | "name": "ipython",
92 | "version": 3
93 | },
94 | "file_extension": ".py",
95 | "mimetype": "text/x-python",
96 | "name": "python",
97 | "nbconvert_exporter": "python",
98 | "pygments_lexer": "ipython3",
99 | "version": "3.9.12"
100 | }
101 | },
102 | "nbformat": 4,
103 | "nbformat_minor": 2
104 | }
105 |
--------------------------------------------------------------------------------
/FE/src/layouts/Sidebar/Sidebar.tsx:
--------------------------------------------------------------------------------
1 | import {
2 | ListItemText,
3 | ListItemIcon,
4 | ListItemButton,
5 | Divider,
6 | Drawer,
7 | IconButton,
8 | ListItem,
9 | Typography,
10 | Box,
11 | TextField,
12 | } from "@mui/material";
13 | import { ReactNode } from "react";
14 | import ChevronLeftIcon from "@mui/icons-material/ChevronLeft";
15 | import List from "@mui/material/List";
16 | import DeleteIcon from "@mui/icons-material/Delete";
17 | import AddCircleOutlineIcon from "@mui/icons-material/AddCircleOutline";
18 | import PhonelinkSetupIcon from "@mui/icons-material/PhonelinkSetup";
19 | import { drawerWidth } from "../../constants";
20 | import { useNavigate } from "react-router-dom";
21 | import { DrawerHeader } from "./styles";
22 | import { usePersonalization } from "../../contexts/usePersonalization";
23 |
/** Shape of one entry in the `ControllersElements` list. */
interface ControllersElement {
  /** Numeric identifier of the entry. */
  id: number;
  /** Label text of the entry. */
  text: string;
  /** Icon node of the entry. */
  icon: ReactNode;
}
29 |
/**
 * Static list of sidebar actions, mapped over in the `Sidebar` render.
 * The ids correspond to the cases of the `handler` switch in `Sidebar`
 * (1 navigates to "/dashboard"; 2 is currently a no-op).
 */
const ControllersElements: ControllersElement[] = [
  {
    id: 1,
    text: "Start New Chat",
    icon: <AddCircleOutlineIcon />,
  },
  {
    id: 2,
    text: "Delete Chats",
    icon: <DeleteIcon />,
  },
];
42 |
/** Props accepted by the `Sidebar` component. */
interface SidebarProps {
  /** Open/closed flag supplied by the parent. */
  open: boolean;
  /** Callback supplied by the parent for closing the drawer. */
  handleDrawerClose(): void;
}
47 |
48 | const Sidebar = (props: SidebarProps) => {
49 | const { open, handleDrawerClose } = props;
50 | const navigate = useNavigate();
51 | const { handleUpdate } = usePersonalization();
52 |
53 | const handler = (id: number) => {
54 | switch (id) {
55 | case 1:
56 | return () => {
57 | navigate("/dashboard");
58 | };
59 | case 2:
60 | return () => {
61 | // sethistoryElements(undefined);
62 | };
63 | }
64 | };
65 |
66 | return (
67 |
80 |
81 |
82 |
83 |
84 | JenAI
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 | {/* */}
93 |
94 | {
111 | handleUpdate(e.target.value);
112 | }}
113 | />
114 |
115 |
116 | {ControllersElements.map((item) => (
117 |
118 |
119 | {item.icon}
120 |
121 |
122 |
123 | ))}
124 |
125 |
126 | );
127 | };
128 |
129 | export default Sidebar;
130 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | # .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | Llama-2-7b-q8/
162 | llama/
163 |
164 | *.bin
--------------------------------------------------------------------------------
/FE/src/data/data.ts:
--------------------------------------------------------------------------------
1 | export const chatHistory = [
2 | {
3 | id: 1,
4 | history: [
5 | {
6 | role: "User",
7 | message: `
8 | I've looked all over for this answer but cannot find it, as most of the answers are how to add JDK to Jenkins for usage within a test. I'm looking for how to ensure Jenkins itself is using OpenJDK 11 that I have installed and not the older JDK 8.
9 | I have checked the Jenkins.xml and it does point to OpenJDK 11 as well as the %java_home% and PATH in the system environment are pointed to the correct locations.
10 | When I check System Properties, I still see things related to Java 8 (java.home, java.specification.version, etc.)
11 | I have no idea where else it could be stored at to change this.
12 | `,
13 | },
14 | {
15 | role: "Assistant",
16 | message: `In your Jenkins UI select Manage Jenkins → Status Information → System Information (displayed as System Properties then; in Jenkins 2.289.3, at least) → search for java.runtime.version or others to look for hints what's possibly going wrong.`,
17 | },
18 | ],
19 | },
20 | {
21 | id: 2,
22 | history: [
23 | {
24 | role: "User",
25 | message: `
26 | pre-commit is a great tool and id like to incorporate it into jenkins pipelines just to make sure someone didn't try to skip it during committing to the repo.
27 | so my questions are:
28 |
29 | what are the best practices to initialize pre-commit? (just initialize it as i would in the repo? the website offers examples for some other CI integrations but not jenkins)
30 | how can i say to jenkins to stop the build if any of the tests performed by pre-commit fails? (exit code? which one would be success or fail?
31 | `,
32 | },
33 | {
34 | role: "Assistant",
35 | message: `
36 | as the docs indicate the best way to run pre-commit in a CI system is with pre-commit run, usually pre-commit run --all-files --show-diff-on-failure
37 | this will exit nonzero which should fail your pipeline as desired
38 |
39 | disclaimer: I'm the creator of pre-commit
40 | `,
41 | },
42 | ],
43 | },
44 | {
45 | id: 3,
46 | history: [
47 | {
48 | role: "User",
49 | message: `
50 | what is Jenkins?
51 | `,
52 | },
53 | {
54 | role: "Assistant",
55 | message: `
56 | Jenkins is an open-source automation server widely used for continuous integration and continuous delivery (CI/CD). It helps automate the parts of software development related to building, testing, and deploying, facilitating continuous integration and continuous delivery.
57 |
58 | Here are some key features of Jenkins:
59 |
60 | 1. **Extensibility**: Jenkins has a large number of plugins available, which allows it to integrate with many other tools and services. This makes it highly customizable to suit various project needs.
61 |
62 | 2. **Distributed Builds**: Jenkins can distribute build/test loads to multiple machines, making it highly scalable.
63 |
64 | 3. **Easy Installation and Configuration**: Jenkins can be installed easily on various platforms and can be configured via its web interface.
65 |
66 | 4. **Pipeline as Code**: Jenkins supports defining pipelines as code using a domain-specific language (DSL) and storing them in a version control system (like Git). This is known as "Jenkins Pipeline" or "Pipeline as Code".
67 |
68 | 5. **Active Community**: Being open-source, Jenkins has a large, active community contributing to its development and providing support.
69 |
70 | 6. **Extensive Plugin Ecosystem**: Jenkins' functionality can be extended through plugins, which cover a wide range of purposes from source code management, build tools, and testing, to notifications and user interfaces.
71 |
72 | Jenkins is used by many organizations to automate their development processes, ensuring faster and more reliable software delivery.
73 | `,
74 | },
75 | ],
76 | },
77 | ];
78 |
--------------------------------------------------------------------------------
/FE/src/layouts/Chatbot/Chatbot.tsx:
--------------------------------------------------------------------------------
1 | import { useParams } from "react-router-dom";
2 | import { Container, DrawerHeader, Main } from "./styles";
3 | import { Box, useTheme } from "@mui/material";
4 | import { ReactNode, useEffect, useState } from "react";
5 |
6 | import { chatHistory } from "../../data/data";
7 | import Input from "./components/Input";
8 | import Message from "./components/Message";
9 | import { sendQuery } from "./server";
10 | import { usePersonalization } from "../../contexts/usePersonalization";
11 |
/** Props accepted by the `Chatbot` component. */
interface ChatbotProps {
  /** Open/closed flag supplied by the parent. */
  open: boolean;
}
15 |
/** A single entry of a conversation. */
export interface MessageEntity {
  /** Author of the entry; this file uses "User" and "Assistant". */
  role: string;
  /** Content of the entry. */
  message: ReactNode;
}
20 |
21 | const Chatbot = (props: ChatbotProps) => {
22 | const { open } = props;
23 | const { chatId } = useParams();
24 | const theme = useTheme();
25 | const chatNumber = parseInt(chatId || "0");
26 | const { text: personalization } = usePersonalization();
27 |
28 | const [conversation, setConversation] = useState([]);
29 | const [query, setQuery] = useState("");
30 | const [disabled, setDisabled] = useState(false);
31 | const [loading, setLoading] = useState(false);
32 |
33 | const onSubmit = (e: React.FormEvent) => {
34 | e.preventDefault();
35 | setDisabled(true);
36 |
37 | setConversation((prevConversation) => [
38 | ...prevConversation,
39 | { role: "User", message: query },
40 | ]);
41 | setQuery("");
42 | setLoading(true);
43 |
44 | sendQuery(query, personalization)
45 | .then((message) => {
46 | setTimeout(() => {
47 | setConversation((prevConversation) => [
48 | ...prevConversation,
49 | { role: "Assistant", message: message },
50 | ]);
51 | setDisabled(false);
52 | setLoading(false);
53 | }, 2000);
54 | })
55 | .catch((error) => {
56 | console.error("Error sending query:", error);
57 | setLoading(false);
58 | setDisabled(false);
59 | });
60 |
61 | // setTimeout(() => {
62 | // setConversation((prevConversation) => [
63 | // ...prevConversation,
64 | // { role: "Assistant", message: "Response" },
65 | // ]);
66 | // setLoading(false);
67 | // setDisabled(false);
68 | // }, 2000);
69 | };
70 |
71 | useEffect(() => {
72 | if (chatNumber >= 1) {
73 | setConversation(
74 | chatHistory.find((chat) => chat["id"] === chatNumber)?.history || []
75 | );
76 | } else {
77 | setConversation([]);
78 | }
79 | }, [chatNumber]);
80 |
81 | return (
82 |
83 |
84 |
85 |
86 |
96 | {conversation.map((msg, index) => (
97 |
98 | ))}
99 | {loading && (
100 |
114 | ),
115 | }}
116 | />
117 | )}
118 |
119 |
125 |
126 |
127 |
128 | );
129 | };
130 |
131 | export default Chatbot;
132 |
--------------------------------------------------------------------------------
/FE/src/layouts/Header/Header.tsx:
--------------------------------------------------------------------------------
1 | import {
2 | Box,
3 | Button,
4 | FormControlLabel,
5 | FormGroup,
6 | IconButton,
7 | styled,
8 | Switch,
9 | Toolbar,
10 | Typography,
11 | useTheme,
12 | } from "@mui/material";
13 | import { AppBar } from "./styles";
14 | import { Link, useLocation } from "react-router-dom";
15 | import MenuIcon from "@mui/icons-material/Menu";
16 | import { useMemo } from "react";
17 | import { useIsDarkTheme } from "../../contexts/useIsDarkTheme";
18 |
/** Props accepted by the `Header` component; both are optional. */
interface HeaderProps {
  /** Open/closed flag supplied by the parent, if any. */
  open?: boolean;
  /** Callback supplied by the parent for opening the drawer, if any. */
  handleDrawerOpen?: () => void;
}
23 |
/** Extra prop read by the style callback of `MaterialUISwitch` (presumably its props type — confirm). */
interface MaterialUISwitchProps {
  // Chooses between the two track colours and the two thumb colours in the style callback.
  isDarkTheme: boolean;
}
27 |
28 | const MaterialUISwitch = styled(Switch)(
29 | ({ theme, isDarkTheme }) => ({
30 | width: 62,
31 | height: 34,
32 | padding: 7,
33 | "& .MuiSwitch-switchBase": {
34 | margin: 1,
35 | padding: 0,
36 | transform: "translateX(6px)",
37 | "&.Mui-checked": {
38 | color: "#fff",
39 | transform: "translateX(22px)",
40 | "& .MuiSwitch-thumb:before": {
41 | backgroundImage: `url('data:image/svg+xml;utf8,')`,
44 | },
45 | "& + .MuiSwitch-track": {
46 | opacity: 1,
47 | backgroundColor: isDarkTheme ? "#8796A5" : "#aab4be",
48 | },
49 | },
50 | },
51 | "& .MuiSwitch-thumb": {
52 | backgroundColor: isDarkTheme
53 | ? theme.palette.primary.main
54 | : theme.palette.primary.dark,
55 | width: 32,
56 | height: 32,
57 | "&::before": {
58 | content: "''",
59 | position: "absolute",
60 | width: "100%",
61 | height: "100%",
62 | left: 0,
63 | top: 0,
64 | backgroundRepeat: "no-repeat",
65 | backgroundPosition: "center",
66 | backgroundImage: `url('data:image/svg+xml;utf8,')`,
69 | },
70 | },
71 | "& .MuiSwitch-track": {
72 | opacity: 1,
73 | backgroundColor: isDarkTheme ? "#8796A5" : "#aab4be",
74 | borderRadius: 20 / 2,
75 | },
76 | })
77 | );
78 |
79 | const Header = (props: HeaderProps) => {
80 | const { open, handleDrawerOpen } = props;
81 | const theme = useTheme();
82 | const { pathname } = useLocation();
83 | const { isDarkTheme, handleThemeChange } = useIsDarkTheme();
84 |
85 | const isLandinPage = useMemo(() => pathname === "/landing", [pathname]);
86 |
87 | return (
88 |
89 |
90 |
91 |
92 | {!isLandinPage && (
93 |
99 |
100 |
101 | )}
102 |
110 |
111 | Jenkins Chatbot
112 |
113 |
114 |
115 |
124 |
137 |
138 |
141 | }
142 | label={isDarkTheme ? "Light Mode" : "Dark Mode"}
143 | onChange={handleThemeChange}
144 | sx={{ color: "white" }}
145 | />
146 |
147 |
148 |
149 |
150 |
151 | );
152 | };
153 |
154 | export default Header;
155 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Enhancing-LLM-with-Jenkins-Knowledge
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | ## Overview
18 |
19 |
20 | - Built using Python.
21 | - This project is part of Google Summer of Code 2024.
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 | ## Setting up env
34 |
35 | You may need to update the environment variables set in `BE/.env` and `FE/.env`.
36 |
37 | ### Frontend env setup
38 |
39 | - Open the `.env` file in the `FE/` directory; the server URL is set to localhost by default
40 |
41 | ```sh
42 | VITE_SERVER_URL = http://127.0.0.1:5000/
43 | ```
44 |
45 | ### Backend env setup
46 |
47 | - Open the `.env` file in the `BE/` directory; you will find both `HOST` and `PORT`, which are configured for localhost by default
48 |
49 | ```sh
50 | FLASK_RUN_HOST = 0.0.0.0
51 | FLASK_RUN_PORT = 5000
52 | ```
53 |
54 |
55 |
56 |
57 |
58 |
59 | ## How To Run
60 |
61 | Open a new terminal in the project directory
62 |
63 | ### Frontend server setup
64 |
65 | - Need to install [Node](https://nodejs.org/en/download/package-manager) first
66 | - Install all required packages
67 |
68 | ```sh
69 | cd ./FE
70 | npm install
71 | ```
72 | - Start the server
73 |
74 | ```sh
75 | npm run dev
76 | ```
77 | - You will get a message that the server is running at http://localhost:5173/
78 |
79 | ### Backend server setup
80 |
81 | - Install the needed packages.
82 |
83 | ```sh
84 | cd ./BE
85 | python3 -m venv .
86 | source ./bin/activate
87 | pip install -r ./requirements.txt
88 | ```
89 |
90 | - Start the server
91 |
92 | ```sh
93 | python app.py
94 | ```
95 |
96 | - Note: the first time you run the BE server, it downloads the model (about 6GB) to your machine. This download happens only on the first run.
97 |
98 |
99 |
100 |
101 |
102 | ## Fine-Tune your version
103 |
104 | You can fine-tune your own version and get it uploaded on hugging face using the following steps
105 |
106 | - we fine-tune llama2 using colab free resources of T4 GPU with 16 GB VRAM
107 | - we provided `./src/Fine-Tuning.ipynb`
108 | - we clone our repository to access the dataset provided for training
109 | ```sh
110 | git clone https://github.com/nouralmulhem/Enhancing-LLM-with-Jenkins-Knowledge.git
111 | ```
112 |
113 | - Google Drive is used to store the checkpoints so that they persist if the Colab environment crashes
114 |
115 | You can change the Drive path where the model is saved by editing the `new_model_path` variable
116 |
117 | - you also can set the number of epochs you would like to use to fine-tune the model by updating `num_train_epochs` variable
118 |
119 | - after getting done with fine-tuning the model you can access `./src/Upload_Model.ipynb` to merge lora weights with the model and upload your own model on hugging face and start using it
120 |
121 | - at this stage you need to update `new_model_path` variable to the correct path on your drive
122 |
123 | - as a final step you need to update `repo_id` variable to match your repo on hugging face
124 |
125 | VOILA! you got your own model
126 |
127 |
128 |
129 |
130 |
131 | ## Convert fine-tuned to GGML
132 |
133 | ### CPU model
134 |
135 | You can load this full model onto the GPU and run it like you would any other hugging face model, but we are here to take it to the next level of running this model on the CPU.
136 |
137 | we are using llama.cpp, so first of all we need to clone the repo
138 |
139 | ```sh
140 | git clone https://github.com/ggerganov/llama.cpp.git
141 | ```
142 |
143 | Llama.cpp has a script called `convert_hf_to_gguf.py` that is used to convert models to the binary GGML format that can be loaded and run on CPU.
144 |
145 | ```sh
146 | python convert_hf_to_gguf.py path/to/fine-tuned/model/ --outtype f16 --outfile path/to/binary/model.bin
147 | ```
148 |
149 | This should output a 13GB binary file at the specified `path/to/binary/model.bin` that is ready to run on CPU with the same code that we started with!
150 |
151 | ### Quantization
152 |
153 | Part of the appeal of the GGML library is being able to quantize this 13GB model into smaller models that can be run even faster. There is a tool called quantize in the Llama.cpp repo that can be used to convert the model to different quantization levels.
154 |
155 | First you need to build the tools in the Llama.cpp repository.
156 |
157 | ```sh
158 | cd llama.cpp
159 | cmake -B build
160 | cmake --build build --config Release
161 | ```
162 |
163 | This will create the tools in the bin directory. You can now use the quantize tool to shrink our model to q8_0 by running:
164 |
165 | ```sh
166 | cd build/bin/release
167 | ./llama-quantize.exe path/to/binary/model.bin path/to/binary/merged-q8_0.bin q8_0
168 | ```
169 | Now we have a 6.7 GB model at path/to/binary/merged-q8_0.bin
170 |
171 | To upload the local quantized model on huggingface
172 | ```sh
173 | huggingface-cli upload username/repo_id path/to/binary/quantized/model.bin model.bin
174 | ```
175 |
176 |
177 |
178 |
179 |
180 |
181 | ## Contributors
182 |
183 |
184 |
185 | 
Nour Almulhem
186 |
187 |
188 |
189 |
190 |
191 |
192 | ## 🔒 License
193 |
194 | > **Note**: This software is licensed under MIT License, See [License](https://github.com/nouralmulhem/Enhancing-LLM-with-Jenkins-Knowledge/blob/main/LICENSE) for more information ©nouralmulhem.
195 |
--------------------------------------------------------------------------------
/src/data collection/parse-jenkins-community.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 35,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import requests\n",
10 | "import numpy as np\n",
11 | "from utils import *"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 36,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "topics_list = []\n",
21 | "number_of_pages = 100\n",
22 | "URLs = [\n",
23 | " \"https://community.jenkins.io/c/using-jenkins/support/8.json\"\n",
24 | "]\n",
25 | "\n",
26 | "for page in URLs:\n",
27 | " \n",
28 | " for page_num in range(number_of_pages): \n",
29 | " \n",
30 | " # URL to make the GET request to\n",
31 | " url = f\"{page}?page={page_num}\"\n",
32 | "\n",
33 | " # Make the GET request\n",
34 | " response = requests.get(url)\n",
35 | "\n",
36 | " if response.status_code == 200:\n",
37 | " # Parse the JSON response into a Python dictionary\n",
38 | " json_response = response.json()\n",
39 | " \n",
40 | " \n",
41 | " # Extract posts from the JSON object\n",
42 | " topics = json_response[\"topic_list\"][\"topics\"]\n",
43 | " # print(page_num, len(topics))\n",
44 | " \n",
45 | " # Find the post with accepted_answer = true\n",
46 | " for topic in topics:\n",
47 | " if topic.get(\"has_accepted_answer\"):\n",
48 | " topics_list.append(topic)\n",
49 | "\n",
50 | " else:\n",
51 | " # Print an error message if the request was not successful\n",
52 | " print(\"Error:\", response.status_code)\n"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 37,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "344\n"
65 | ]
66 | }
67 | ],
68 | "source": [
69 | "print(len(topics_list))"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 38,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "# Get the topics IDs\n",
79 | "topics_ids = [topic[\"id\"] for topic in topics_list]"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 39,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "No accepted answer found for topic 11272\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "accepted_answers = []\n",
97 | "original_questions = []\n",
98 | "\n",
99 | "for topic_id in topics_ids:\n",
100 | " \n",
101 | " # URL to make the GET request to\n",
102 | " url = f\"https://community.jenkins.io/t/{topic_id}.json?track_visit=true&forceLoad=true\"\n",
103 | "\n",
104 | " # Make the GET request\n",
105 | " response = requests.get(url)\n",
106 | "\n",
107 | " if response.status_code == 200:\n",
108 | " # Parse the JSON response into a Python dictionary\n",
109 | " json_response = response.json()\n",
110 | " \n",
111 | " \n",
112 | " # Extract posts from the JSON object\n",
113 | " posts = json_response[\"post_stream\"][\"posts\"]\n",
114 | " # print(posts)\n",
115 | " \n",
116 | " # Find the post with accepted_answer = true\n",
117 | " for post in posts:\n",
118 | " if post.get(\"accepted_answer\"):\n",
119 | " original_questions.append(posts[0]['cooked'])\n",
120 | " accepted_answers.append(post['cooked'])\n",
121 | " break\n",
122 | " else:\n",
123 | " print(f\"No accepted answer found for topic {topic_id}\")\n",
124 | "\n",
125 | " else:\n",
126 | " # Print an error message if the request was not successful\n",
127 | " print(\"Error:\", response.status_code)\n"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 40,
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "name": "stdout",
137 | "output_type": "stream",
138 | "text": [
139 | "343 343\n"
140 | ]
141 | }
142 | ],
143 | "source": [
144 | "print(len(original_questions), len(accepted_answers))"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 41,
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "name": "stdout",
154 | "output_type": "stream",
155 | "text": [
156 | "Jenkins Controller Version: 2.401.2 LTS
\n",
157 | "We have a Jenkins controller installed and configured using the official Jenkins Helm chart on an EKS cluster, with all resource provisioning managed by Terraform.
\n",
158 | "Currently, it is functioning well and is accessible at https://jenkins-utility.example.com/ using our LDAP credentials. All agent nodes connect to same jenkins endpoint.
\n",
159 | "However, when the Jenkins controller is restarted either via the URL using safe restart or due to the pod being killed/restarted, I can still access Jenkins using the same URL. Despite this, all agent nodes get disconnected because the Jenkins location URL resets to localhost:8080 instead of https://jenkins-utility.example.com/. Please see the attached image for reference.
\n",
160 | "Below is the Jenkins Configuration as Code (CasC) configuration we are using:
\n",
161 | "jenkins:\n",
162 | " authorizationStrategy:\n",
163 | " projectMatrix:\n",
164 | " entries:\n",
165 | " - group:\n",
166 | " name: \"authenticated\"\n",
167 | " permissions:\n",
168 | " - \"Agent/Configure\"\n",
169 | " - \"Agent/Connect\"\n",
170 | " - \"Agent/Create\"\n",
171 | " - \"Agent/Delete\"\n",
172 | " - \"Agent/Disconnect\"\n",
173 | " - \"Credentials/View\"\n",
174 | " - \"Job/Build\"\n",
175 | " - \"Job/Cancel\"\n",
176 | " - \"Job/Configure\"\n",
177 | " - \"Job/Create\"\n",
178 | " - \"Job/Delete\"\n",
179 | " - \"Job/Discover\"\n",
180 | " - \"Job/Move\"\n",
181 | " - \"Job/Read\"\n",
182 | " - \"Job/Workspace\"\n",
183 | " - \"Overall/Read\"\n",
184 | " - \"Overall/SystemRead\"\n",
185 | " - \"Run/Replay\"\n",
186 | " - \"View/Configure\"\n",
187 | " - \"View/Create\"\n",
188 | " - \"View/Delete\"\n",
189 | " - \"View/Read\"\n",
190 | " - group:\n",
191 | " name: \"jenkins-admins\"\n",
192 | " permissions:\n",
193 | " - \"Overall/Administer\"\n",
194 | " - user:\n",
195 | " name: \"anonymous\"\n",
196 | " permissions:\n",
197 | " - \"Job/Discover\"\n",
198 | " - \"Job/ViewStatus\"\n",
199 | " - \"Overall/Read\"\n",
200 | " - user:\n",
201 | " name: \"altif@example.com\"\n",
202 | " permissions:\n",
203 | " - \"Overall/Administer\"\n",
204 | " securityRealm:\n",
205 | " activeDirectory:\n",
206 | " bindPassword: \"xxxxxxxx\"\n",
207 | " cache:\n",
208 | " size: 1000\n",
209 | " ttl: 3600\n",
210 | " customDomain: true\n",
211 | " domains:\n",
212 | " - bindPassword: \"xxxxxxxx\"\n",
213 | " name: \"cor.example.com\"\n",
214 | " servers: \"ldap.example.com:3268\"\n",
215 | " tlsConfiguration: TRUST_ALL_CERTIFICATES\n",
216 | " groupLookupStrategy: TOKENGROUPS\n",
217 | " removeIrrelevantGroups: false\n",
218 | " requireTLS: false\n",
219 | " startTls: true\n",
220 | " disableRememberMe: false\n",
221 | " mode: NORMAL\n",
222 | " numExecutors: 2\n",
223 | " labelString: \"controller\"\n",
224 | " projectNamingStrategy: \"standard\"\n",
225 | " markupFormatter:\n",
226 | " plainText\n",
227 | " clouds:\n",
228 | " - kubernetes:\n",
229 | " containerCapStr: \"10\"\n",
230 | " defaultsProviderTemplate: \"\"\n",
231 | " connectTimeout: \"5\"\n",
232 | " readTimeout: \"15\"\n",
233 | " jenkinsUrl: \"http://jenkins-utility-prod.jenkins-utility-prod.svc.cluster.local:8080\"\n",
234 | " jenkinsTunnel: \"jenkins-utility-prod-agent.jenkins-utility-prod.svc.cluster.local:50000\"\n",
235 | " maxRequestsPerHostStr: \"32\"\n",
236 | " name: \"kubernetes\"\n",
237 | " namespace: \"jenkins-utility-prod\"\n",
238 | " serverUrl: \"https://kubernetes.default\"\n",
239 | " crumbIssuer:\n",
240 | " standard:\n",
241 | " excludeClientIPFromCrumb: false\n",
242 | "security:\n",
243 | " apiToken:\n",
244 | " creationOfLegacyTokenEnabled: false\n",
245 | " tokenGenerationOnCreationEnabled: false\n",
246 | " usageStatisticsEnabled: true\n",
247 | "unclassified:\n",
248 | " location:\n",
249 | " adminAddress: devops@example.com\n",
250 | " url: https://jenkins-utility.example.com/\n",
251 | "
\n",
252 | "I have spent many hours troubleshooting this issue but have not found a solution yet. Any assistance or references to resolve this issue would be greatly appreciated.
\n",
253 | "Thanks.
\n",
254 | "
Thank you @mawinter69 @poddingue for your prompt responses and willingness to assist, much appreciated!
\n",
255 | "Although, the initial suggestions didn’t resolve my issue, I continued my investigation. After exploring and digging JENKINS_HOME i.e. /var/jenkins_home, I noticed the init.groovy.d directory, I discovered groovy script named base.groovy that was resetting the Jenkins Location URL to localhost:8080. By modifying this script, I was able to set the correct base URL, and now everything works as expected.
\n",
256 | "Thank you all for your support!
\n"
257 | ]
258 | }
259 | ],
260 | "source": [
261 | "print(original_questions[0], accepted_answers[0])"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 33,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "combined_arrays = np.column_stack((original_questions, accepted_answers))\n",
271 | "\n",
272 | "df = pd.DataFrame(combined_arrays, columns=['questions', 'answers'])\n",
273 | "\n",
274 | "df['questions'] = df['questions'].apply(remove_html_tags)\n",
275 | "df['answers'] = df['answers'].apply(remove_html_tags)"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 34,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "df.to_csv('../datasets/Community Questions Refined.csv', index=False, encoding='utf-8', date_format=str)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 18,
290 | "metadata": {},
291 | "outputs": [
292 | {
293 | "name": "stdout",
294 | "output_type": "stream",
295 | "text": [
296 | "CSV file with arrays saved as 'special_characters.csv'\n"
297 | ]
298 | }
299 | ],
300 | "source": [
301 | "# import csv\n",
302 | "\n",
303 | "# rows = list(zip(original_questions, accepted_answers))\n",
304 | "\n",
305 | "# with open('Community Questions.csv', 'w', newline='', encoding='utf-8') as csvfile:\n",
306 | "# writer = csv.writer(csvfile)\n",
307 | "# writer.writerow(['Question', 'Answer']) # Write header\n",
308 | " \n",
309 | "# for row in rows:\n",
310 | "# writer.writerow(row)\n",
311 | "\n",
312 | "# print(\"CSV file with arrays saved as 'special_characters.csv'\")\n"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 53,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "# import json\n",
322 | "# # Write posts data to a JSON file\n",
323 | "# with open(\"show.json\", \"w\") as json_file:\n",
324 | "# json.dump(posts, json_file, indent=4)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 54,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "# accepted_answers[0]"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": []
342 | }
343 | ],
344 | "metadata": {
345 | "kernelspec": {
346 | "display_name": "base",
347 | "language": "python",
348 | "name": "python3"
349 | },
350 | "language_info": {
351 | "codemirror_mode": {
352 | "name": "ipython",
353 | "version": 3
354 | },
355 | "file_extension": ".py",
356 | "mimetype": "text/x-python",
357 | "name": "python",
358 | "nbconvert_exporter": "python",
359 | "pygments_lexer": "ipython3",
360 | "version": "3.11.3"
361 | }
362 | },
363 | "nbformat": 4,
364 | "nbformat_minor": 2
365 | }
366 |
--------------------------------------------------------------------------------
/src/data collection/refine-html-tags.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"JxCCtmYCKXuO"},"outputs":[],"source":["from utils import *"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":81},"executionInfo":{"elapsed":12,"status":"ok","timestamp":1717154418699,"user":{"displayName":"Nour Ziad Almulhem","userId":"13402313266978952785"},"user_tz":-180},"id":"SwPtcV_cINsT","outputId":"673b5589-94c5-453e-b477-0bd94213affc"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"summary":"{\n \"name\": \"df\",\n \"rows\": 341,\n \"fields\": [\n {\n \"column\": \"questions\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 341,\n \"samples\": [\n \"All my jobs are configured by Job DSL. I connect to bitbucket via Bitbucket Server Integration. Unfortunately, I cannot find how to set \\u201cBitbucket webhook trigger\\u201d option from Job DSL:
\\n
\\nDo you have any suggestions on how to do that?
\\nHere you can find example DSL code:
\\nmultibranchPipelineJob(\\\"Repo name\\\") {\\n branchSources {\\n branchSource {\\n source {\\n BbS {\\n id(\\\"all-branches\\\")\\n credentialsId(\\\"ci.user_password\\\")\\n sshCredentialsId(\\\"ci.user\\\")\\n serverId(\\\"XXXX\\\")\\n projectName(\\\"YYYYYY\\\")\\n repositoryName(repository)\\n mirrorName(\\\"\\\")\\n traits {\\n gitBranchDiscovery()\\n headWildcardFilter {\\n includes('*')\\n excludes(\\\"\\\")\\n }\\n }\\n }\\n }\\n }\\n }\\n\\n triggers {\\n //any trigger or something?\\n }\\n}\\n
\",\n \"Hi, I have created a docker image using this script:
\\nFROM centos:7\\n\\nRUN yum install -y \\\\\\n wget \\\\\\n git \\\\\\n make \\\\\\n tar \\\\\\n centos-release-scl \\\\\\n sudo\\n\\nRUN mkdir -p /root/.ssh && \\\\\\ncd /root/.ssh && \\\\\\nwget http://192.168.111.64/env/auto-keys.tar && \\\\\\ntar -xvf auto-keys.tar\\n\\nRUN cd /root && wget https://repo.anaconda.com/archive/Anaconda3-2020.02-Linux-x86_64.sh\\n\\nRUN cd /root && sudo chmod 777 Anaconda3-2020.02-Linux-x86_64.sh && \\\\\\nsudo ./Anaconda3-2020.02-Linux-x86_64.sh -b -p /root/anaconda3 && \\\\\\nsudo ln -fs /root/anaconda3/bin/python /usr/bin/python3 && \\\\\\nsudo ln -fs /root/anaconda3/bin/python /bin/python3 && \\\\\\nrm -fr Anaconda3-2020.02-Linux-x86_64.sh\\n\\nRUN cd /usr/local && \\\\\\nwget http://192.168.111.64/env/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz && \\\\\\ntar -xvf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz && \\\\\\nrm -rf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz\\n\\nRUN cd /usr/local && \\\\\\nwget http://192.168.111.64/env/gcc-arm-8.3-2019.03-x86_64-aarch64-elf.tar.xz && \\\\\\ntar -xvf gcc-arm-8.3-2019.03-x86_64-aarch64-elf.tar.xz && \\\\\\nrm -rf gcc-arm-8.3-2019.03-x86_64-aarch64-elf.tar.xz\\n\\nRUN yum install -y \\\\\\n sshpass \\\\\\n devtoolset-8\\n\\nRUN yum install -y openssl\\n\\nRUN echo 'source /opt/rh/devtoolset-8/enable' | sudo tee /etc/profile.d/gcc-version.sh && source /etc/profile.d/gcc-version.sh\\n# RUN rm -rf /usr/bin/gcc\\n\\n# RUN sudo ln -s /opt/rh/devtoolset-8/root/bin/gcc /usr/bin/gcc\\n\\nENV PATH /usr/local/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu/bin:${PATH}\\nENV PATH /usr/local/gcc-arm-8.3-2019.03-x86_64-aarch64-elf/bin:${PATH}\\n# ENV PATH /opt/rh/devtoolset-8/root/bin/gcc:${PATH}\\n\\n# RUN cd ~ && sed -i '$aexport PATH=$PATH:/usr/local/sbin:/usr/sbin' .bashrc && source .bashrc\\n
\\nThis is my pipeline:
\\nnode('192.168.111.134'){\\n def myImage\\n myImage = docker.image 'prbuild:v7'\\n stage('inside'){\\n \\n myImage.inside{\\n sh '''\\n cd /root/${project}\\n git pull\\n git branch\\n ./quince_x86-64_wout_build_b58R.sh\\n\\n '''\\n }\\n }\\n\\n}\\n
\\nThe error I came across
\\n10:45:10 /bin/sh: g++: command not found\\n10:45:10 /bin/sh: g++: command not found\\n10:45:10 /bin/sh: g++: command not found\\n10:45:10 /bin/sh: g++: command not found\\n10:45:10 /bin/sh: g++: command not found\\n10:45:10 /bin/sh: g++: command not found\\n10:45:10 /bin/sh: g++: command not found\\n10:45:10 /bin/sh: g++: command not found\\n10:45:10 /bin/sh: g++: command not found\\n
\\nThe question is I can run these scripts on my local container correctly but it show /bin/sh: g++: command not found when I am running the same script in Jenkins
\",\n \"Hi,
\\nmy shared library needs to parse the job config.xml to get the configured Jenkinsfile path of a multibranch pipeline.
\\nwith Jenkins 2.375.4.2 this works fine
\\ndef call() {\\n withCredentials([usernamePassword(credentialsId: 'xxx', passwordVariable: 'pw', usernameVariable: 'user')]) {\\n genericSh('curl -u ' + user + ':' + pw + \\\" ${JOB_URL + 'config.xml'} -O\\\")\\n }\\n jobconfig = readFile \\\"${env.WORKSPACE}/config.xml\\\" // otherwise 'prolog not allowed in context'\\n xml = new XmlParser().parseText(jobconfig)\\n jenkinsfilepath = xml.children()['scriptPath'].text()\\n getYamlRoot = new File(jenkinsfilepath).getParent()\\n readYaml file: \\\"${getYamlRoot ?: env.WORKSPACE}\\\" + '/pipeline.yaml'\\n}\\n
\\nbut with Jenkins 2.401.3.3 the curl command gives
\\n\\n\\n- \\n
TLSv1.3 (IN), TLS handshake, Newsession Ticket (4):
\\n \\n- \\n
Mark bundle as not supporting multiuse
\\n< HTTP/1.1 403 Forbidden
\\n< Date: Tue, 29 Aug 2023 15:19:51 GMT
\\n< X-Content-Type-Options: nosniff
\\n< X-You-Are-Authenticated-As: xxx
\\n< X-You-Are-In-Group-Disabled: JENKINS-39402: use -Dhudson.security.AccessDeniedException2.REPORT_GROUP_HEADERS=true or use /whoAmI to diagnose
\\n< X-Required-Permission: hudson.model.Hudson.Read
\\n< X-Permission-Implied-By: hudson.security.Permission.GenericRead
\\n< X-Permission-Implied-By: hudson.model.Hudson.Administer
\\n< Content-Type: text/html;charset=utf-8
\\n< Expires: Thu, 01 Jan 1970 00:00:00 GMT
\\n< Cache-Control: no-cache,no-store,must-revalidate
\\n< X-Hudson-Theme: default
\\n< Referrer-Policy: same-origin
\\n< Cross-Origin-Opener-Policy: same-origin
\\n< Set-Cookie: JSESSIONID.7b2b95ba=node01m0vlt3zqe4oz11xw57rl6gglh96.node0; Path=/; Secure; HttpOnly
\\n< X-Hudson: 1.395
\\n< X-Jenkins: 2.401.3.3
\\n< X-Jenkins-Session: b904a4f9
\\n< X-Frame-Options: ALLOW-FROM http://xxx:81
\\n< X-Instance-Identity: xxx
\\n< Content-Length: 23013
\\n< Server: Jetty(10.0.13)
\\n<
\\n<head resURL=\\\"/static/b904a4f9\\\" data-rooturl=\\\"\\\" data-resurl=\\\"/static/b904a4f9\\\" [...]\\n \\n
\\n
\\nIn browser the response is the same for JOB_URL/config.xml in both Jenkins versions.
\\nThat\\u2019s also what i get as response from curl with Jenkins 2.375.4.2
\\n<flow-definition plugin=\\\"workflow-job@1344.cb-v917b_4f08a_b_5c\\\">\\n <actions>\\n <org.jenkinsci.plugins.workflow.multibranch.JobPropertyTrackerAction plugin=\\\"workflow-multibranch@756.v891d88f2cd46\\\">\\n <jobPropertyDescriptors>\\n <string>jenkins.model.BuildDiscarderProperty</string>\\n </jobPropertyDescriptors>\\n </org.jenkinsci.plugins.workflow.multibranch.JobPropertyTrackerAction>\\n </actions>\\n <keepDependencies>false</keepDependencies>\\n <properties>\\n <org.jenkinsci.plugins.workflow.multibranch.BranchJobProperty plugin=\\\"workflow-multibranch@756.v891d88f2cd46\\\">\\n [...]\\n
\\nQuestions
\\nWhat\\u2019s the reason for the new behaviour ? Guess it\\u2019s some security related new feature, checked the change logs but didn\\u2019t find something helpful.
\\nIs there maybe a better / another way to get the Jenkinsfile path ? Searched a lot, but it seems there\\u2019s no other way than $JOB_URL/config.xml
\\nGilbert
\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answers\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 341,\n \"samples\": [\n \"\\nThat\\u2019s what I need!
\\nThanks to this I found the solution:
\\n triggers {\\n BitbucketWebhookMultibranchTrigger {\\n refTrigger(true)\\n pullRequestTrigger(false)\\n }\\n }\\n
\\n@halkeys big thanks!
\",\n \"The problem probably your PATH setting via ENV ( it will not do what you expect\\u2026)
\\nI would create a (sh)-script for setting up the environment and call it from the jenkins job.
\",\n \"I\\u2019m not aware of changes here. More likely that you have an issue with permissions or your password/credential.
\\nYou should be able to get the scriptpath with groovy code like this (not tested). Though this will work only when you\\u2019re not in a sandbox. But within a shared library it works I guess.
\\n@NonCPS\\ndef getScriptPath() {\\n job = build.getParent()\\n definition = job.getDefinition()\\n if (definition instanceof org.jenkinsci.plugins.workflow.cps.CpsScmFlowDefinition) {\\n return definition.scriptPath\\n }\\n return null\\n}\\n
\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}","type":"dataframe","variable_name":"df"},"text/html":["\n"," \n"," \n","\n","\n"," \n"," \n"," \n"," questions \n"," answers \n"," \n"," \n"," \n"," \n"," 0 \n"," <p>I use it to run autotests using python (pyt... \n"," <p>I would assume that the 2 GiB for the contr... \n"," \n"," \n","
\n","\n"," \n"," \n"],"text/plain":[" questions \\\n","0 I use it to run autotests using python (pyt... \n","\n"," answers \n","0
I would assume that the 2 GiB for the contr... "]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["file_path = '../datasets/raw/QueryResults.csv'\n","\n","df = pd.read_csv(file_path)\n","\n","df.head(1)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"c4hpJk1MQLVb"},"outputs":[],"source":["df_updated = df.copy()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"SpYY2AIBKCac"},"outputs":[],"source":["df_updated['Question Body'] = df_updated['Question Body'].apply(remove_html_tags)\n","df_updated['Answer Body'] = df_updated['Answer Body'].apply(remove_html_tags)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Zn1jzfkLQFgP"},"outputs":[],"source":["# idx = 7\n","\n","# print(df['Question Body'][idx], df_updated['Question Body'][idx])"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"MntB0fEJKi5s"},"outputs":[],"source":["df_updated.to_csv('../datasets/QueryResultsUpdated.csv', index=False)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"VIAsqdZjc5Do"},"outputs":[],"source":[]}],"metadata":{"colab":{"provenance":[{"file_id":"1DL4jU9yDKJzg0BT8q9KDcz_5abtxe_4P","timestamp":1716256798358}]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
/FE/public/Jenkins_logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/src/data preprocessing/preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Controllers"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 91,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "file_name = '../../datasets/Jenkins Docs QA.csv'\n",
17 | "questions_col = 'Question'\n",
18 | "answers_col = 'Answer'\n",
19 | "\n",
20 | "# file_name = '../../datasets/Community Questions Refined.csv'\n",
21 | "# questions_col = 'questions'\n",
22 | "# answers_col = 'answers'\n",
23 | "\n",
24 | "# file_name = '../../datasets/QueryResultsUpdated.csv'\n",
25 | "# questions_col = 'Question Body'\n",
26 | "# answers_col = 'Answer Body'\n",
27 | "max_sequence_length = 11000\n",
28 | "num_samples = 1000 # None"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "# Imports"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 92,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "from utils import *\n",
45 | "import pandas as pd\n",
46 | "from datasets import Dataset"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "# Load And Process Our Dataset - External Sources"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 93,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "raw_data = pd.read_csv(file_name)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 94,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "765"
74 | ]
75 | },
76 | "execution_count": 94,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "len(raw_data)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 95,
88 | "metadata": {},
89 | "outputs": [
90 | {
91 | "data": {
92 | "text/html": [
93 | "
\n",
94 | "\n",
107 | "\n",
108 | " \n",
109 | " \n",
110 | " \n",
111 | " Question \n",
112 | " Answer \n",
113 | " \n",
114 | " \n",
115 | " \n",
116 | " \n",
117 | " 0 \n",
118 | " What is Jenkins? \n",
119 | " Jenkins is a self-contained, open-source autom... \n",
120 | " \n",
121 | " \n",
122 | " 1 \n",
123 | " What should I refer to if I want to develop my... \n",
124 | " If you want to extend the functionality of Jen... \n",
125 | " \n",
126 | " \n",
127 | "
\n",
128 | ""
129 | ],
130 | "text/plain": [
131 | " Question \n",
132 | "0 What is Jenkins? \\\n",
133 | "1 What should I refer to if I want to develop my... \n",
134 | "\n",
135 | " Answer \n",
136 | "0 Jenkins is a self-contained, open-source autom... \n",
137 | "1 If you want to extend the functionality of Jen... "
138 | ]
139 | },
140 | "execution_count": 95,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "raw_data.head(2)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 96,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "# Define condition to drop rows where answers_col contains a <code> tag\n",
156 | "condition = raw_data[answers_col].str.contains('<code>')\n",
157 | "\n",
158 | "# Drop rows based on the condition\n",
159 | "raw_data = raw_data[~condition]"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 97,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "data": {
169 | "text/plain": [
170 | "765"
171 | ]
172 | },
173 | "execution_count": 97,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "len(raw_data)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 98,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/html": [
190 | "\n",
191 | "\n",
204 | "\n",
205 | " \n",
206 | " \n",
207 | " \n",
208 | " Question \n",
209 | " Answer \n",
210 | " \n",
211 | " \n",
212 | " \n",
213 | " \n",
214 | " 0 \n",
215 | " What is Jenkins? \n",
216 | " Jenkins is a self-contained, open-source autom... \n",
217 | " \n",
218 | " \n",
219 | " 1 \n",
220 | " What should I refer to if I want to develop my... \n",
221 | " If you want to extend the functionality of Jen... \n",
222 | " \n",
223 | " \n",
224 | " 2 \n",
225 | " What is the purpose of this Jenkins documentat... \n",
226 | " The purpose of this documentation is to help u... \n",
227 | " \n",
228 | " \n",
229 | " 3 \n",
230 | " What is a good starting point for users who ha... \n",
231 | " Users who have never used Jenkins or have limi... \n",
232 | " \n",
233 | " \n",
234 | " 4 \n",
235 | " Where can I find more detailed information abo... \n",
236 | " For more detailed information about using Jenk... \n",
237 | " \n",
238 | " \n",
239 | "
\n",
240 | ""
241 | ],
242 | "text/plain": [
243 | " Question \n",
244 | "0 What is Jenkins? \\\n",
245 | "1 What should I refer to if I want to develop my... \n",
246 | "2 What is the purpose of this Jenkins documentat... \n",
247 | "3 What is a good starting point for users who ha... \n",
248 | "4 Where can I find more detailed information abo... \n",
249 | "\n",
250 | " Answer \n",
251 | "0 Jenkins is a self-contained, open-source autom... \n",
252 | "1 If you want to extend the functionality of Jen... \n",
253 | "2 The purpose of this documentation is to help u... \n",
254 | "3 Users who have never used Jenkins or have limi... \n",
255 | "4 For more detailed information about using Jenk... "
256 | ]
257 | },
258 | "execution_count": 98,
259 | "metadata": {},
260 | "output_type": "execute_result"
261 | }
262 | ],
263 | "source": [
264 | "raw_data.head(5)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 99,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "data": {
274 | "text/plain": [
275 | "0 Jenkins is a self-contained, open-source autom...\n",
276 | "1 If you want to extend the functionality of Jen...\n",
277 | "2 The purpose of this documentation is to help u...\n",
278 | "3 Users who have never used Jenkins or have limi...\n",
279 | "4 For more detailed information about using Jenk...\n",
280 | " ... \n",
281 | "760 File fingerprinting in Jenkins is used to trac...\n",
282 | "761 To set up file fingerprinting in Jenkins, rele...\n",
283 | "762 Jenkins stores the MD5 checksum (fingerprint) ...\n",
284 | "763 Jenkins users can benefit from file fingerprin...\n",
285 | "764 Some typical scenarios where file fingerprinti...\n",
286 | "Name: Answer, Length: 765, dtype: object"
287 | ]
288 | },
289 | "execution_count": 99,
290 | "metadata": {},
291 | "output_type": "execute_result"
292 | }
293 | ],
294 | "source": [
295 | "raw_data[answers_col]"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 100,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "# Define condition to drop rows where questions_col contains a <code> tag\n",
305 | "condition = raw_data[questions_col].str.contains('<code>')\n",
306 | "\n",
307 | "# Drop rows based on the condition\n",
308 | "raw_data = raw_data[~condition]"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 101,
314 | "metadata": {},
315 | "outputs": [
316 | {
317 | "data": {
318 | "text/plain": [
319 | "765"
320 | ]
321 | },
322 | "execution_count": 101,
323 | "metadata": {},
324 | "output_type": "execute_result"
325 | }
326 | ],
327 | "source": [
328 | "len(raw_data)"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 102,
334 | "metadata": {},
335 | "outputs": [
336 | {
337 | "data": {
338 | "text/html": [
339 | "\n",
340 | "\n",
353 | "\n",
354 | " \n",
355 | " \n",
356 | " \n",
357 | " Question \n",
358 | " Answer \n",
359 | " \n",
360 | " \n",
361 | " \n",
362 | " \n",
363 | " 0 \n",
364 | " What is Jenkins? \n",
365 | " Jenkins is a self-contained, open-source autom... \n",
366 | " \n",
367 | " \n",
368 | " 1 \n",
369 | " What should I refer to if I want to develop my... \n",
370 | " If you want to extend the functionality of Jen... \n",
371 | " \n",
372 | " \n",
373 | " 2 \n",
374 | " What is the purpose of this Jenkins documentat... \n",
375 | " The purpose of this documentation is to help u... \n",
376 | " \n",
377 | " \n",
378 | " 3 \n",
379 | " What is a good starting point for users who ha... \n",
380 | " Users who have never used Jenkins or have limi... \n",
381 | " \n",
382 | " \n",
383 | " 4 \n",
384 | " Where can I find more detailed information abo... \n",
385 | " For more detailed information about using Jenk... \n",
386 | " \n",
387 | " \n",
388 | "
\n",
389 | ""
390 | ],
391 | "text/plain": [
392 | " Question \n",
393 | "0 What is Jenkins? \\\n",
394 | "1 What should I refer to if I want to develop my... \n",
395 | "2 What is the purpose of this Jenkins documentat... \n",
396 | "3 What is a good starting point for users who ha... \n",
397 | "4 Where can I find more detailed information abo... \n",
398 | "\n",
399 | " Answer \n",
400 | "0 Jenkins is a self-contained, open-source autom... \n",
401 | "1 If you want to extend the functionality of Jen... \n",
402 | "2 The purpose of this documentation is to help u... \n",
403 | "3 Users who have never used Jenkins or have limi... \n",
404 | "4 For more detailed information about using Jenk... "
405 | ]
406 | },
407 | "execution_count": 102,
408 | "metadata": {},
409 | "output_type": "execute_result"
410 | }
411 | ],
412 | "source": [
413 | "raw_data.head()"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 104,
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "name": "stdout",
423 | "output_type": "stream",
424 | "text": [
425 | "length of our dataset: 765\n"
426 | ]
427 | }
428 | ],
429 | "source": [
430 | "# Combine columns into the desired format\n",
431 | "df = pd.DataFrame(columns=['text'])\n",
432 | "\n",
433 | "df['text'] = raw_data.apply(\n",
434 | " lambda row: f\"<s>[INST] {row[questions_col].strip()} [/INST] {row[answers_col]} </s>\",\n",
435 | " axis=1\n",
436 | ")\n",
437 | "print(f'length of our dataset: {len(df)}')"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 5,
443 | "metadata": {},
444 | "outputs": [
445 | {
446 | "name": "stdout",
447 | "output_type": "stream",
448 | "text": [
449 | "The maximum row length is: 30508\n"
450 | ]
451 | }
452 | ],
453 | "source": [
454 | "# Calculate the length of each row\n",
455 | "df['row_length'] = df['text'].apply(len)\n",
456 | "\n",
457 | "# Find the maximum row length\n",
458 | "max_length = df['row_length'].max()\n",
459 | "print(f\"The maximum row length is: {max_length}\")"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 6,
465 | "metadata": {},
466 | "outputs": [
467 | {
468 | "data": {
469 | "image/png": "",
470 | "text/plain": [
471 | ""
472 | ]
473 | },
474 | "metadata": {},
475 | "output_type": "display_data"
476 | }
477 | ],
478 | "source": [
479 | "plot_histogram(df['row_length'])"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": 7,
485 | "metadata": {},
486 | "outputs": [
487 | {
488 | "name": "stdout",
489 | "output_type": "stream",
490 | "text": [
491 | "3391\n"
492 | ]
493 | }
494 | ],
495 | "source": [
496 | "# Keep only rows whose length is at most max_sequence_length (i.e. 11000) so every input has an acceptable length\n",
497 | "df_filtered = df[df['row_length'] <= max_sequence_length]\n",
498 | "\n",
499 | "# Display the filtered DataFrame\n",
500 | "print(len(df_filtered))"
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": 8,
506 | "metadata": {},
507 | "outputs": [
508 | {
509 | "data": {
510 | "image/png": "",
511 | "text/plain": [
512 | ""
513 | ]
514 | },
515 | "metadata": {},
516 | "output_type": "display_data"
517 | }
518 | ],
519 | "source": [
520 | "plot_histogram(df_filtered['row_length'])"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": 9,
526 | "metadata": {},
527 | "outputs": [],
528 | "source": [
529 | "# drop unneeded columns\n",
530 | "df_filtered = df_filtered.drop(['row_length'], axis=1)"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 10,
536 | "metadata": {},
537 | "outputs": [],
538 | "source": [
539 | "# convert to a Dataset and select the first num_samples (i.e. 1000) records for now (to limit memory usage)\n",
540 | "dataset = Dataset.from_pandas(df_filtered)\n",
541 | "\n",
542 | "if num_samples: \n",
543 | " dataset = dataset.select(range(num_samples))"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": 11,
549 | "metadata": {},
550 | "outputs": [
551 | {
552 | "data": {
553 | "text/plain": [
554 | "Dataset({\n",
555 | " features: ['text', '__index_level_0__'],\n",
556 | " num_rows: 1000\n",
557 | "})"
558 | ]
559 | },
560 | "execution_count": 11,
561 | "metadata": {},
562 | "output_type": "execute_result"
563 | }
564 | ],
565 | "source": [
566 | "dataset"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": null,
572 | "metadata": {},
573 | "outputs": [],
574 | "source": []
575 | }
576 | ],
577 | "metadata": {
578 | "kernelspec": {
579 | "display_name": "Python 3",
580 | "language": "python",
581 | "name": "python3"
582 | },
583 | "language_info": {
584 | "codemirror_mode": {
585 | "name": "ipython",
586 | "version": 3
587 | },
588 | "file_extension": ".py",
589 | "mimetype": "text/x-python",
590 | "name": "python",
591 | "nbconvert_exporter": "python",
592 | "pygments_lexer": "ipython3",
593 | "version": "3.11.3"
594 | }
595 | },
596 | "nbformat": 4,
597 | "nbformat_minor": 2
598 | }
599 |
--------------------------------------------------------------------------------