├── src ├── test │ └── java │ │ └── lunatix │ │ └── ragscan │ │ └── RagscanApplicationTests.java └── main │ ├── java │ └── lunatix │ │ └── ragscan │ │ ├── loader │ │ ├── FileLoader.java │ │ ├── FileLoaderFactory.java │ │ ├── JsonFileLoader.java │ │ ├── TextFileLoader.java │ │ ├── OtherFileLoader.java │ │ ├── FileType.java │ │ ├── PdfFileLoader.java │ │ └── FilesReader.java │ │ ├── RagscanApplication.java │ │ ├── store │ │ ├── QdrantStoreConfigurations.java │ │ ├── EmbeddingConfigurations.java │ │ └── GeminiOpenAiEmbeddingModel.java │ │ └── gemini │ │ └── GeminiCommand.java │ └── resources │ ├── application.properties │ └── docs │ ├── file.json │ └── initial-design.md ├── docker-compose.yaml ├── .gitignore ├── .mvn └── wrapper │ └── maven-wrapper.properties ├── LICENSE ├── .github └── workflows │ └── build.yml ├── settings.xml ├── README.md ├── pom.xml ├── mvnw.cmd └── mvnw /src/test/java/lunatix/ragscan/RagscanApplicationTests.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan; 2 | 3 | import org.junit.jupiter.api.Test; 4 | import org.springframework.boot.test.context.SpringBootTest; 5 | 6 | @SpringBootTest 7 | class RagscanApplicationTests { 8 | 9 | @Test 10 | void contextLoads() { 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/loader/FileLoader.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.loader; 2 | 3 | import java.util.List; 4 | 5 | import org.springframework.ai.document.Document; 6 | import org.springframework.core.io.Resource; 7 | 8 | public interface FileLoader { 9 | 10 | public List load(Resource resource); 11 | } 12 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | qdrant: 3 | image: 
qdrant/qdrant:v1.13.0 4 | environment: 5 | QDRANT__SERVICE__GRPC_PORT: 6334 6 | QDRANT__SERVICE__REST_PORT: 6333 7 | ports: 8 | - "6333:6333" # tcp 9 | - "6334:6334" # grpc 10 | volumes: 11 | - ./qdrant_data:/qdrant/storage 12 | 13 | volumes: 14 | models_cache: 15 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/loader/FileLoaderFactory.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.loader; 2 | 3 | /** Creates the {@link FileLoader} implementation matching a given {@link FileType}. */ 4 | public class FileLoaderFactory { 5 | 6 | private FileLoaderFactory() {} 7 | 8 | public static FileLoader create(FileType fileType) { 9 | return switch (fileType) { 10 | case PDF -> new PdfFileLoader(); 11 | case TXT -> new TextFileLoader(); 12 | // JSON gets its dedicated structured reader; remaining types fall back to Tika. 13 | case JSON -> new JsonFileLoader(); 14 | case HTML, XML, OTHER -> new OtherFileLoader(); 15 | }; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/RagscanApplication.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | import org.springframework.shell.command.annotation.CommandScan; 6 | 7 | @SpringBootApplication 8 | @CommandScan 9 | public class RagscanApplication { 10 | 11 | public static void main(String[] args) { 12 | SpringApplication.run(RagscanApplication.class, args); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | HELP.md 2 | target/ 3 | !.mvn/wrapper/maven-wrapper.jar 4 | !**/src/main/**/target/ 5 | !**/src/test/**/target/ 6 | 7 | ### STS ### 8 | .apt_generated 9 | .classpath 10 | .factorypath 11 | .project 12 | .settings 13 | .springBeans 14 | .sts4-cache 15 | 16 | ### IntelliJ IDEA ### 17 | .idea 
18 | *.iws 19 | *.iml 20 | *.ipr 21 | 22 | ### NetBeans ### 23 | /nbproject/private/ 24 | /nbbuild/ 25 | /dist/ 26 | /nbdist/ 27 | /.nb-gradle/ 28 | build/ 29 | !**/src/main/**/build/ 30 | !**/src/test/**/build/ 31 | 32 | ### VS Code ### 33 | .vscode/ 34 | 35 | ### QDRANT 36 | /qdrant_data 37 | 38 | ### logs 39 | *.log 40 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/loader/JsonFileLoader.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.loader; 2 | 3 | import java.util.List; 4 | 5 | import lombok.extern.slf4j.Slf4j; 6 | import org.springframework.ai.document.Document; 7 | import org.springframework.ai.reader.JsonReader; 8 | import org.springframework.core.io.Resource; 9 | 10 | @Slf4j 11 | public class JsonFileLoader implements FileLoader { 12 | 13 | @Override 14 | public List load(Resource resource) { 15 | log.info("Loading json file {}", resource.getFilename()); 16 | final var jsonFile = new JsonReader(resource); 17 | return jsonFile.read(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/loader/TextFileLoader.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.loader; 2 | 3 | import java.util.List; 4 | 5 | import lombok.extern.slf4j.Slf4j; 6 | import org.springframework.ai.document.Document; 7 | import org.springframework.ai.reader.TextReader; 8 | import org.springframework.core.io.Resource; 9 | 10 | @Slf4j 11 | public class TextFileLoader implements FileLoader { 12 | 13 | @Override 14 | public List load(Resource resource) { 15 | log.info("Loading text file {}", resource.getFilename()); 16 | final var textReader = new TextReader(resource); 17 | return textReader.read(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- 
/src/main/java/lunatix/ragscan/loader/OtherFileLoader.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.loader; 2 | 3 | import java.util.List; 4 | 5 | import lombok.extern.slf4j.Slf4j; 6 | import org.springframework.ai.document.Document; 7 | import org.springframework.ai.reader.tika.TikaDocumentReader; 8 | import org.springframework.core.io.Resource; 9 | 10 | @Slf4j 11 | public class OtherFileLoader implements FileLoader { 12 | 13 | /** 14 | * This Supports variant of files, for example DOCX, PPTX, HTML, XML etc. 15 | * click here for full list 16 | */ 17 | @Override 18 | public List load(Resource resource) { 19 | log.info("Loading file {}", resource.getFilename()); 20 | final var file = new TikaDocumentReader(resource); 21 | return file.read(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /.mvn/wrapper/maven-wrapper.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | wrapperVersion=3.3.2 18 | distributionType=only-script 19 | distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.7/apache-maven-3.9.7-bin.zip 20 | -------------------------------------------------------------------------------- /src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | spring.application.name=ragscan 2 | 3 | spring.main.web-application-type=none 4 | server.port=9090 5 | 6 | spring.shell.interactive.enabled=true 7 | spring.shell.script.enabled=true 8 | 9 | spring.threads.virtual.enabled=true 10 | 11 | spring.ai.vectorstore.qdrant.host=localhost 12 | spring.ai.vectorstore.qdrant.port=6334 13 | spring.ai.vectorstore.qdrant.collection-name=ragscan 14 | spring.ai.vectorstore.qdrant.initialize-schema=true 15 | 16 | spring.ai.openai.api-key=${GOOGLE_API_KEY} 17 | spring.ai.openai.base-url=https://generativelanguage.googleapis.com 18 | spring.ai.openai.chat.options.model=gemini-2.0-flash 19 | spring.ai.openai.chat.completions-path=/v1beta/openai/chat/completions 20 | 21 | spring.ai.openai.embedding.api-key=${GOOGLE_API_KEY} 22 | spring.ai.openai.embedding.base-url=https://generativelanguage.googleapis.com 23 | spring.ai.openai.embedding.embeddings-path=/v1beta/openai/embeddings 24 | spring.ai.openai.embedding.options.model=gemini-embedding-exp-03-07 25 | spring.ai.openai.embedding.options.dimensions=768 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Aland Osman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | 
copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/loader/FileType.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.loader; 2 | 3 | import java.nio.file.Path; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import lombok.AllArgsConstructor; 8 | import lombok.Getter; 9 | import lombok.extern.slf4j.Slf4j; 10 | import org.apache.commons.io.FilenameUtils; 11 | 12 | @Getter 13 | @AllArgsConstructor 14 | @Slf4j 15 | public enum FileType { 16 | HTML(List.of("htm", "html")), 17 | TXT(List.of("txt")), 18 | PDF(List.of("pdf")), 19 | JSON(List.of("json")), 20 | XML(List.of("xml")), 21 | OTHER(List.of()); 22 | 23 | private final List fileExtensions; 24 | 25 | public static FileType fromFileExtension(String fileExtension) { 26 | log.info("Getting fileType for {}", fileExtension); 27 | return Arrays.stream(FileType.values()) 28 | .filter(f -> f.getFileExtensions().contains(fileExtension.toLowerCase())) 29 | .findFirst() 30 | .orElse(OTHER); 31 | } 32 | 33 | public static String getFileExtension(Path filePath) { 34 | log.info("Getting file extension for {}", filePath); 35 | 
return FilenameUtils.getExtension(filePath.toString()); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/loader/PdfFileLoader.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.loader; 2 | 3 | import java.util.List; 4 | 5 | import lombok.extern.slf4j.Slf4j; 6 | import org.springframework.ai.document.Document; 7 | import org.springframework.ai.reader.ExtractedTextFormatter; 8 | import org.springframework.ai.reader.pdf.PagePdfDocumentReader; 9 | import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig; 10 | import org.springframework.core.io.Resource; 11 | 12 | @Slf4j 13 | public class PdfFileLoader implements FileLoader { 14 | 15 | @Override 16 | public List load(Resource resource) { 17 | log.info("Loading PDF Document {}", resource.getFilename()); 18 | PagePdfDocumentReader pdfFile = new PagePdfDocumentReader(resource, 19 | PdfDocumentReaderConfig.builder() 20 | .withPageTopMargin(0) 21 | .withPageExtractedTextFormatter(ExtractedTextFormatter.builder() 22 | .withNumberOfTopTextLinesToDelete(0) 23 | .build()) 24 | .withPagesPerDocument(1) 25 | .build()); 26 | 27 | return pdfFile.read(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/store/QdrantStoreConfigurations.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.store; 2 | 3 | import java.util.concurrent.Future; 4 | 5 | import io.qdrant.client.QdrantClient; 6 | import io.qdrant.client.grpc.Collections; 7 | import io.vavr.control.Try; 8 | import lombok.RequiredArgsConstructor; 9 | import org.springframework.beans.factory.annotation.Value; 10 | import org.springframework.shell.standard.ShellComponent; 11 | import org.springframework.shell.standard.ShellMethod; 12 | 13 | @RequiredArgsConstructor 14 | 
@ShellComponent 15 | public class QdrantStoreConfigurations { 16 | 17 | private final QdrantClient qdrantClient; 18 | 19 | @Value("${spring.ai.vectorstore.qdrant.collection-name}") 20 | private String collectionName; 21 | 22 | @ShellMethod( 23 | key = "collection-size", 24 | value = """ 25 | Give it a desired collection size 26 | """, 27 | group = "Prerequisite") 28 | public void saveCollectionSize(int size) { 29 | Try.of(() -> Collections.VectorParams.newBuilder().setSize(size) 30 | .setDistance(Collections.Distance.Cosine) 31 | .build()) 32 | .map(vectorParams -> qdrantClient.recreateCollectionAsync(collectionName, vectorParams)) 33 | .andThenTry(Future::get); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build Ragscan 2 | on: [push, pull_request] 3 | jobs: 4 | build-with-graal: 5 | if: false # currently disabled 6 | name: Ragscan on ${{ matrix.os }} 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | matrix: 10 | os: [windows-latest, ubuntu-latest] 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: graalvm/setup-graalvm@v1 14 | with: 15 | java-version: '22' 16 | distribution: 'graalvm' 17 | github-token: ${{ secrets.GITHUB_TOKEN }} 18 | native-image-job-reports: 'true' 19 | 20 | - name: Build Ragscan 21 | run: | 22 | echo "GRAALVM_HOME: $GRAALVM_HOME" 23 | echo "JAVA_HOME: $JAVA_HOME" 24 | java --version 25 | native-image --version 26 | - name: Compile with maven 27 | run: mvn -X -Pnative native:compile -DskipTests 28 | 29 | - name: Upload binary 30 | uses: actions/upload-artifact@v4 31 | with: 32 | name: ragscan-${{ matrix.os }} 33 | path: ragscan* 34 | build: 35 | name: Ragscan 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v4 39 | - name: Set up Java 40 | uses: actions/setup-java@v4 41 | with: 42 | distribution: oracle 43 | java-version: 21 44 | - name: Build 
Ragscan 45 | run: mvn -f pom.xml clean package -DskipTests 46 | - name: Upload build artifact 47 | if: github.ref_name == 'master' 48 | uses: actions/upload-artifact@v4 49 | with: 50 | name: artifact 51 | path: ./target/*.jar -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/store/EmbeddingConfigurations.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.store; 2 | 3 | import io.micrometer.observation.ObservationRegistry; 4 | import org.springframework.ai.embedding.EmbeddingModel; 5 | import org.springframework.ai.openai.OpenAiEmbeddingOptions; 6 | import org.springframework.ai.openai.api.OpenAiApi; 7 | import org.springframework.beans.factory.annotation.Value; 8 | import org.springframework.context.annotation.Bean; 9 | import org.springframework.context.annotation.Configuration; 10 | import org.springframework.retry.support.RetryTemplate; 11 | 12 | @Configuration 13 | class EmbeddingConfigurations { 14 | 15 | @Value("${spring.ai.openai.api-key}") 16 | private String apiKey; 17 | 18 | @Value("${spring.ai.openai.base-url}") 19 | private String baseUrl; 20 | 21 | 22 | @Value("${spring.ai.openai.embedding.options.model}") 23 | private String embeddingModel; 24 | 25 | @Value("${spring.ai.openai.embedding.embeddings-path}") 26 | private String embeddingPath; 27 | 28 | @Value("${spring.ai.openai.embedding.options.dimensions}") 29 | private Integer embeddingDimension; 30 | 31 | @Bean 32 | EmbeddingModel embeddingModel() { 33 | final var openAiApi = OpenAiApi.builder() 34 | .apiKey(apiKey) 35 | .baseUrl(baseUrl) 36 | .embeddingsPath(embeddingPath) 37 | .build(); 38 | return new GeminiOpenAiEmbeddingModel( 39 | openAiApi, 40 | OpenAiEmbeddingOptions.builder() 41 | .model(embeddingModel) 42 | .dimensions(embeddingDimension) 43 | .build(), 44 | RetryTemplate.builder() 45 | .maxAttempts(10) 46 | .fixedBackoff(1000) 47 | .build(), 48 | 
ObservationRegistry.create() 49 | ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /settings.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | 7 | spring-milestones 8 | https://repo.spring.io/milestone 9 | * 10 | 11 | 12 | spring-snapshots 13 | https://repo.spring.io/snapshot 14 | * 15 | 16 | 17 | central 18 | spring-snapshots 19 | https://repo.maven.apache.org/maven2 20 | 21 | 22 | 23 | 24 | 25 | ok 26 | 27 | 28 | spring-milestones 29 | Spring Milestones 30 | https://repo.spring.io/milestone 31 | 32 | false 33 | 34 | 35 | 36 | 37 | false 38 | 39 | central 40 | Maven Repository Switchboard 41 | https://repo1.maven.org/maven2 42 | 43 | 44 | spring-snapshots 45 | Spring Snapshots 46 | https://repo.spring.io/snapshot 47 | 48 | false 49 | 50 | 51 | 52 | 53 | 54 | 55 | ok 56 | 57 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/gemini/GeminiCommand.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.gemini; 2 | 3 | import java.util.List; 4 | 5 | import lombok.RequiredArgsConstructor; 6 | import org.springframework.ai.chat.messages.Message; 7 | import org.springframework.ai.chat.messages.SystemMessage; 8 | import org.springframework.ai.chat.messages.UserMessage; 9 | import org.springframework.ai.chat.model.ChatModel; 10 | import org.springframework.ai.document.Document; 11 | import org.springframework.ai.vectorstore.SearchRequest; 12 | import org.springframework.ai.vectorstore.VectorStore; 13 | import org.springframework.shell.standard.ShellComponent; 14 | import org.springframework.shell.standard.ShellMethod; 15 | 16 | @ShellComponent 17 | @RequiredArgsConstructor 18 | public class GeminiCommand { 19 | 20 | private final VectorStore vectorStore; 21 | private final ChatModel chatModel; 22 | 23 | @ShellMethod( 24 | key = "ask", 25 | value = "Ask 
a question, Note: you need to load files", 26 | group = "Chat") 27 | private String ask(String question) { 28 | final var resultInDB = vectorStore.similaritySearch( 29 | SearchRequest.builder() 30 | .query(question) 31 | .topK(5) 32 | .build() 33 | ); 34 | assert resultInDB != null && !resultInDB.isEmpty() 35 | : "error getting context"; 36 | 37 | final var generatedPrompt = getGenerateContentRequest(question, resultInDB) 38 | .toArray(new Message[0]); 39 | return chatModel.call(generatedPrompt); 40 | } 41 | 42 | private static List getGenerateContentRequest(String message, List resultInDB) { 43 | final var systemContent = """ 44 | Answer only from the data you got as input, otherwise say you don't know, and clean up weird formats 45 | like if it's json clean it up, if it's Markdown clean it up etc... 46 | Your name is Ragscan. 47 | You will always get some questions with some context. Use the context only. 48 | """; 49 | final var systemMessage = new SystemMessage(systemContent); 50 | final var messageContent = """ 51 | Question: %s 52 | Context: %s 53 | """; 54 | 55 | final var userMessage = new UserMessage(String.format(messageContent, message, resultInDB.toString())); 56 | 57 | return List.of( 58 | systemMessage, 59 | userMessage 60 | ); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/loader/FilesReader.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.loader; 2 | 3 | import java.nio.file.Files; 4 | import java.nio.file.Path; 5 | import java.nio.file.Paths; 6 | import java.util.List; 7 | 8 | import io.vavr.control.Try; 9 | import lombok.RequiredArgsConstructor; 10 | import lombok.extern.slf4j.Slf4j; 11 | import org.springframework.ai.transformer.splitter.TextSplitter; 12 | import org.springframework.ai.transformer.splitter.TokenTextSplitter; 13 | import org.springframework.ai.vectorstore.VectorStore; 14 | import 
org.springframework.core.io.InputStreamResource; 15 | import org.springframework.shell.standard.ShellComponent; 16 | import org.springframework.shell.standard.ShellMethod; 17 | 18 | @RequiredArgsConstructor 19 | @ShellComponent 20 | @Slf4j 21 | public class FilesReader { 22 | 23 | private final VectorStore vectorStore; 24 | 25 | @ShellMethod( 26 | key = "load", 27 | value = "Give it the main folder and it will load supported files inside of it", 28 | group = "Prerequisite") 29 | public String loadFiles(String fullPath) { 30 | return Try.withResources(() -> Files.walk(Path.of(fullPath))) 31 | .of(paths -> paths 32 | .filter(Files::isRegularFile) 33 | .map(path -> { 34 | final var fileExtension = FileType.getFileExtension(path.getFileName()); 35 | final var fileType = FileType.fromFileExtension(fileExtension); 36 | final var fileLoader = FileLoaderFactory.create(fileType); 37 | return Try.of(() -> Files.newInputStream(path)) 38 | .map(InputStreamResource::new) 39 | .map(fileLoader::load) 40 | .peek(documents -> { 41 | final var splitter = new TokenTextSplitter(); 42 | final var splitDocuments = splitter.apply(documents); 43 | log.info("Adding documents..."); 44 | vectorStore.accept(splitDocuments); 45 | log.info("added {} documents", splitDocuments.size()); 46 | }) 47 | .get(); 48 | }) 49 | .toList() 50 | ) 51 | .map(ignored -> "loaded Successfully") 52 | .getOrElseThrow(throwable -> new RuntimeException("Error while loading file", throwable)); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Simple CLI Retrieval Augmented Generation Scanner 2 | ================================================= 3 | Aim of the project: A showcase of a RAG scanner written in Java and using [Spring AI](https://docs.spring.io/spring-ai/reference/api/index.html), which scans the targeted documents and you can ask questions to the LLM regarding 
the given documents. 4 | 5 | ## Disclaimer 6 | This tool is intended for educational and productivity purposes only. It is designed to assist users in managing and querying their own documents. Any illegal or unethical use of this software is strictly prohibited. 7 | 8 | ## Requirements 9 | 1. [Java 21](https://www.oracle.com/java/technologies/javase/jdk21-archive-downloads.html) installed on your device 10 | 2. [Docker](https://www.docker.com/products/docker-desktop/) 11 | 3. Create an environment variable named `GOOGLE_API_KEY` and set it to your [Google Gemini API key](https://ai.google.dev/gemini-api/docs/api-key) 12 | 13 | ## Installation 14 | 1. Navigate to the project directory 15 | 2. Open CMD/Powershell/Terminal 16 | 3. For Windows run `./mvnw clean install`, for Linux/Mac run `./mvnw clean install` 17 | 18 | ## How to use: 19 | 1. Run `docker-compose up` in your CMD/Powershell/Terminal 20 | 2. Run the project using maven, on Windows: `./mvnw spring-boot:run`, on Linux/Mac run `./mvnw spring-boot:run`. 21 | 3. When the shell opens type `collection-size 768` (for Gemini `768` is compatible). 22 | 4. Place your files in a directory, copy the full path of the directory, and run something like this `load //your//path`, wait till the files are chunked and loaded to `Qdrant vector database`. 23 | 5. Finally in the shell write `ask "your question here"` and that's it. 24 | 25 | 26 | ### Notes 27 | It's a simple project, needs a lot of improvements like: 28 | 1. Improve chunking documents (Currently chunked by token size) 29 | 2. Support more file types (Currently supports txt, HTML, JSON, MD, docx, ppt, pdf, and a lot more) 30 | 3. Support other Chat models and Embeddings like GPT, Ollama, etc... (currently supports Gemini version `gemini-2.0-flash` and embedding `gemini-embedding-exp-03-07`, the reason I decided to use Gemini is that it has a good free tier) 31 | 4.
Support to make it a standalone executable and a jar file, (Currently you can build it yourself and run it, it has no problem, but I will simplify it) 32 | 5. Support other vector databases ( Currently supports Qdrant, to be honest, it's good enough) 33 | 6. Support custom System Context and custom similar returned documents in DB (Default, for now, is 5.) 34 | 35 | #### Rabbit hole 36 | Don't try to retrieve an API key from older `.git` versions, it's a rabbit hole :) 37 | 38 | Please create an Issue, if something is wrong I will look into it, and feel free to contribute to the project. 39 | ============== 40 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | org.springframework.boot 7 | spring-boot-starter-parent 8 | 3.3.1 9 | 10 | 11 | lunatix 12 | ragscan 13 | 0.0.1-SNAPSHOT 14 | ragscan 15 | ragscan 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 21 31 | 1.0.0-SNAPSHOT 32 | 3.3.0 33 | 21 34 | 21 35 | 36 | 37 | 38 | org.springframework.ai 39 | spring-ai-qdrant-store-spring-boot-starter 40 | 41 | 42 | org.springframework.shell 43 | spring-shell-starter 44 | 45 | 46 | org.jsoup 47 | jsoup 48 | 1.18.1 49 | 50 | 51 | org.projectlombok 52 | lombok 53 | true 54 | 55 | 56 | org.springframework.boot 57 | spring-boot-starter-test 58 | test 59 | 60 | 61 | org.springframework.shell 62 | spring-shell-starter-test 63 | test 64 | 65 | 66 | org.springframework.ai 67 | spring-ai-tika-document-reader 68 | 1.0.0-SNAPSHOT 69 | 70 | 71 | org.apache.maven.plugins 72 | maven-source-plugin 73 | 3.3.1 74 | 75 | 76 | org.graalvm.buildtools 77 | native-maven-plugin 78 | 0.10.2 79 | 80 | 81 | org.springframework.boot 82 | spring-boot-maven-plugin 83 | 3.3.1 84 | 85 | 86 | org.springframework.ai 87 | spring-ai-pdf-document-reader 88 | 1.0.0-SNAPSHOT 89 | 90 | 91 | org.springframework.ai 92 | 
spring-ai-openai-spring-boot-starter 93 | 94 | 95 | org.springframework.ai 96 | spring-ai-qdrant-store-spring-boot-starter 97 | 98 | 99 | org.apache.httpcomponents.client5 100 | httpclient5 101 | 5.2.1 102 | 103 | 104 | 105 | io.vavr 106 | vavr 107 | 0.10.4 108 | 109 | 110 | 111 | 112 | 113 | org.springframework.ai 114 | spring-ai-bom 115 | ${spring-ai.version} 116 | pom 117 | import 118 | 119 | 120 | org.springframework.shell 121 | spring-shell-dependencies 122 | ${spring-shell.version} 123 | pom 124 | import 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | org.graalvm.buildtools 133 | native-maven-plugin 134 | 0.10.2 135 | 136 | 137 | org.springframework.boot 138 | spring-boot-maven-plugin 139 | 140 | 141 | 142 | org.projectlombok 143 | lombok 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | spring-snapshots 153 | Spring Snapshots 154 | https://repo.spring.io/snapshot 155 | 156 | false 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /src/main/java/lunatix/ragscan/store/GeminiOpenAiEmbeddingModel.java: -------------------------------------------------------------------------------- 1 | package lunatix.ragscan.store; 2 | 3 | import java.util.List; 4 | import java.util.Objects; 5 | 6 | import io.micrometer.observation.ObservationConvention; 7 | import io.micrometer.observation.ObservationRegistry; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | import org.springframework.ai.chat.metadata.DefaultUsage; 11 | import org.springframework.ai.embedding.Embedding; 12 | import org.springframework.ai.embedding.EmbeddingOptions; 13 | import org.springframework.ai.embedding.EmbeddingRequest; 14 | import org.springframework.ai.embedding.EmbeddingResponse; 15 | import org.springframework.ai.embedding.EmbeddingResponseMetadata; 16 | import org.springframework.ai.embedding.observation.DefaultEmbeddingModelObservationConvention; 17 | import 
org.springframework.ai.embedding.observation.EmbeddingModelObservationContext; 18 | import org.springframework.ai.embedding.observation.EmbeddingModelObservationDocumentation; 19 | import org.springframework.ai.model.ModelOptionsUtils; 20 | import org.springframework.ai.openai.OpenAiEmbeddingModel; 21 | import org.springframework.ai.openai.OpenAiEmbeddingOptions; 22 | import org.springframework.ai.openai.api.OpenAiApi; 23 | import org.springframework.ai.openai.api.common.OpenAiApiConstants; 24 | import org.springframework.lang.Nullable; 25 | import org.springframework.retry.support.RetryTemplate; 26 | 27 | public class GeminiOpenAiEmbeddingModel extends OpenAiEmbeddingModel { 28 | 29 | private static final Logger logger = LoggerFactory.getLogger(GeminiOpenAiEmbeddingModel.class); 30 | 31 | 32 | private static final ObservationConvention DEFAULT_OBSERVATION_CONVENTION = new DefaultEmbeddingModelObservationConvention(); 33 | private static final ObservationConvention OBSERVATION_CONVENTION = DEFAULT_OBSERVATION_CONVENTION; 34 | private final OpenAiEmbeddingOptions defaultOptions; 35 | 36 | private final RetryTemplate retryTemplate; 37 | private final ObservationRegistry observationRegistry; 38 | private final OpenAiApi openAiApi; 39 | 40 | public GeminiOpenAiEmbeddingModel(OpenAiApi openAiApi, 41 | OpenAiEmbeddingOptions defaultOptions, 42 | RetryTemplate retryTemplate, 43 | ObservationRegistry observationRegistry) { 44 | super(openAiApi); 45 | this.defaultOptions = defaultOptions; 46 | this.retryTemplate = retryTemplate; 47 | this.observationRegistry = observationRegistry; 48 | this.openAiApi = openAiApi; 49 | } 50 | 51 | @Override 52 | public EmbeddingResponse call(EmbeddingRequest request) { 53 | OpenAiEmbeddingOptions requestOptions = mergeOptions(request.getOptions(), this.defaultOptions); 54 | OpenAiApi.EmbeddingRequest> apiRequest = createRequest(request, requestOptions); 55 | 56 | var observationContext = EmbeddingModelObservationContext.builder() 57 | 
.embeddingRequest(request) 58 | .provider(OpenAiApiConstants.PROVIDER_NAME) 59 | .requestOptions(requestOptions) 60 | .build(); 61 | 62 | return Objects.requireNonNull(EmbeddingModelObservationDocumentation.EMBEDDING_MODEL_OPERATION 63 | .observation(OBSERVATION_CONVENTION, DEFAULT_OBSERVATION_CONVENTION, () -> observationContext, 64 | this.observationRegistry) 65 | .observe(() -> { 66 | OpenAiApi.EmbeddingList apiEmbeddingResponse = this.retryTemplate 67 | .execute(ctx -> this.openAiApi.embeddings(apiRequest).getBody()); 68 | 69 | if (apiEmbeddingResponse == null) { 70 | logger.warn("No embeddings returned for request: {}", request); 71 | return new EmbeddingResponse(List.of()); 72 | } 73 | 74 | var metadata = new EmbeddingResponseMetadata(apiEmbeddingResponse.model(), 75 | getDefaultUsage(apiEmbeddingResponse.usage())); 76 | 77 | List embeddings = apiEmbeddingResponse.data() 78 | .stream() 79 | .map(e -> new Embedding(e.embedding(), e.index())) 80 | .toList(); 81 | 82 | EmbeddingResponse embeddingResponse = new EmbeddingResponse(embeddings, metadata); 83 | 84 | observationContext.setResponse(embeddingResponse); 85 | 86 | return embeddingResponse; 87 | })); 88 | } 89 | 90 | private OpenAiEmbeddingOptions mergeOptions(@Nullable EmbeddingOptions runtimeOptions, 91 | OpenAiEmbeddingOptions defaultOptions) { 92 | var runtimeOptionsForProvider = ModelOptionsUtils.copyToTarget(runtimeOptions, EmbeddingOptions.class, 93 | OpenAiEmbeddingOptions.class); 94 | 95 | if (runtimeOptionsForProvider == null) { 96 | return defaultOptions; 97 | } 98 | 99 | return OpenAiEmbeddingOptions.builder() 100 | // Handle portable embedding options 101 | .model(ModelOptionsUtils.mergeOption(runtimeOptionsForProvider.getModel(), defaultOptions.getModel())) 102 | .dimensions(ModelOptionsUtils.mergeOption(runtimeOptionsForProvider.getDimensions(), 103 | defaultOptions.getDimensions())) 104 | // Handle OpenAI specific embedding options 105 | 
.encodingFormat(ModelOptionsUtils.mergeOption(runtimeOptionsForProvider.getEncodingFormat(), 106 | defaultOptions.getEncodingFormat())) 107 | .user(ModelOptionsUtils.mergeOption(runtimeOptionsForProvider.getUser(), defaultOptions.getUser())) 108 | .build(); 109 | } 110 | 111 | private OpenAiApi.EmbeddingRequest> createRequest(EmbeddingRequest request, 112 | OpenAiEmbeddingOptions requestOptions) { 113 | return new OpenAiApi.EmbeddingRequest<>(request.getInstructions(), requestOptions.getModel(), 114 | requestOptions.getEncodingFormat(), requestOptions.getDimensions(), requestOptions.getUser()); 115 | } 116 | 117 | /* 118 | Because Gemini doesn't provide default usage we will use a mock data, otherwise we will get an NPE exception 119 | that's why this class is created. 120 | */ 121 | private DefaultUsage getDefaultUsage(OpenAiApi.Usage usage) { 122 | return new DefaultUsage(Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE, usage); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /mvnw.cmd: -------------------------------------------------------------------------------- 1 | <# : batch portion 2 | @REM ---------------------------------------------------------------------------- 3 | @REM Licensed to the Apache Software Foundation (ASF) under one 4 | @REM or more contributor license agreements. See the NOTICE file 5 | @REM distributed with this work for additional information 6 | @REM regarding copyright ownership. The ASF licenses this file 7 | @REM to you under the Apache License, Version 2.0 (the 8 | @REM "License"); you may not use this file except in compliance 9 | @REM with the License. 
You may obtain a copy of the License at 10 | @REM 11 | @REM https://www.apache.org/licenses/LICENSE-2.0 12 | @REM 13 | @REM Unless required by applicable law or agreed to in writing, 14 | @REM software distributed under the License is distributed on an 15 | @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | @REM KIND, either express or implied. See the License for the 17 | @REM specific language governing permissions and limitations 18 | @REM under the License. 19 | @REM ---------------------------------------------------------------------------- 20 | 21 | @REM ---------------------------------------------------------------------------- 22 | @REM Apache Maven Wrapper startup batch script, version 3.3.2 23 | @REM 24 | @REM Optional ENV vars 25 | @REM MVNW_REPOURL - repo url base for downloading maven distribution 26 | @REM MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven 27 | @REM MVNW_VERBOSE - true: enable verbose log; others: silence the output 28 | @REM ---------------------------------------------------------------------------- 29 | 30 | @IF "%__MVNW_ARG0_NAME__%"=="" (SET __MVNW_ARG0_NAME__=%~nx0) 31 | @SET __MVNW_CMD__= 32 | @SET __MVNW_ERROR__= 33 | @SET __MVNW_PSMODULEP_SAVE=%PSModulePath% 34 | @SET PSModulePath= 35 | @FOR /F "usebackq tokens=1* delims==" %%A IN (`powershell -noprofile "& {$scriptDir='%~dp0'; $script='%__MVNW_ARG0_NAME__%'; icm -ScriptBlock ([Scriptblock]::Create((Get-Content -Raw '%~f0'))) -NoNewScope}"`) DO @( 36 | IF "%%A"=="MVN_CMD" (set __MVNW_CMD__=%%B) ELSE IF "%%B"=="" (echo %%A) ELSE (echo %%A=%%B) 37 | ) 38 | @SET PSModulePath=%__MVNW_PSMODULEP_SAVE% 39 | @SET __MVNW_PSMODULEP_SAVE= 40 | @SET __MVNW_ARG0_NAME__= 41 | @SET MVNW_USERNAME= 42 | @SET MVNW_PASSWORD= 43 | @IF NOT "%__MVNW_CMD__%"=="" (%__MVNW_CMD__% %*) 44 | @echo Cannot start maven from wrapper >&2 && exit /b 1 45 | @GOTO :EOF 46 | : end batch / begin powershell #> 47 | 48 | $ErrorActionPreference = "Stop" 49 | if ($env:MVNW_VERBOSE 
-eq "true") { 50 | $VerbosePreference = "Continue" 51 | } 52 | 53 | # calculate distributionUrl, requires .mvn/wrapper/maven-wrapper.properties 54 | $distributionUrl = (Get-Content -Raw "$scriptDir/.mvn/wrapper/maven-wrapper.properties" | ConvertFrom-StringData).distributionUrl 55 | if (!$distributionUrl) { 56 | Write-Error "cannot read distributionUrl property in $scriptDir/.mvn/wrapper/maven-wrapper.properties" 57 | } 58 | 59 | switch -wildcard -casesensitive ( $($distributionUrl -replace '^.*/','') ) { 60 | "maven-mvnd-*" { 61 | $USE_MVND = $true 62 | $distributionUrl = $distributionUrl -replace '-bin\.[^.]*$',"-windows-amd64.zip" 63 | $MVN_CMD = "mvnd.cmd" 64 | break 65 | } 66 | default { 67 | $USE_MVND = $false 68 | $MVN_CMD = $script -replace '^mvnw','mvn' 69 | break 70 | } 71 | } 72 | 73 | # apply MVNW_REPOURL and calculate MAVEN_HOME 74 | # maven home pattern: ~/.m2/wrapper/dists/{apache-maven-,maven-mvnd--}/ 75 | if ($env:MVNW_REPOURL) { 76 | $MVNW_REPO_PATTERN = if ($USE_MVND) { "/org/apache/maven/" } else { "/maven/mvnd/" } 77 | $distributionUrl = "$env:MVNW_REPOURL$MVNW_REPO_PATTERN$($distributionUrl -replace '^.*'+$MVNW_REPO_PATTERN,'')" 78 | } 79 | $distributionUrlName = $distributionUrl -replace '^.*/','' 80 | $distributionUrlNameMain = $distributionUrlName -replace '\.[^.]*$','' -replace '-bin$','' 81 | $MAVEN_HOME_PARENT = "$HOME/.m2/wrapper/dists/$distributionUrlNameMain" 82 | if ($env:MAVEN_USER_HOME) { 83 | $MAVEN_HOME_PARENT = "$env:MAVEN_USER_HOME/wrapper/dists/$distributionUrlNameMain" 84 | } 85 | $MAVEN_HOME_NAME = ([System.Security.Cryptography.MD5]::Create().ComputeHash([byte[]][char[]]$distributionUrl) | ForEach-Object {$_.ToString("x2")}) -join '' 86 | $MAVEN_HOME = "$MAVEN_HOME_PARENT/$MAVEN_HOME_NAME" 87 | 88 | if (Test-Path -Path "$MAVEN_HOME" -PathType Container) { 89 | Write-Verbose "found existing MAVEN_HOME at $MAVEN_HOME" 90 | Write-Output "MVN_CMD=$MAVEN_HOME/bin/$MVN_CMD" 91 | exit $? 92 | } 93 | 94 | if (! 
$distributionUrlNameMain -or ($distributionUrlName -eq $distributionUrlNameMain)) { 95 | Write-Error "distributionUrl is not valid, must end with *-bin.zip, but found $distributionUrl" 96 | } 97 | 98 | # prepare tmp dir 99 | $TMP_DOWNLOAD_DIR_HOLDER = New-TemporaryFile 100 | $TMP_DOWNLOAD_DIR = New-Item -Itemtype Directory -Path "$TMP_DOWNLOAD_DIR_HOLDER.dir" 101 | $TMP_DOWNLOAD_DIR_HOLDER.Delete() | Out-Null 102 | trap { 103 | if ($TMP_DOWNLOAD_DIR.Exists) { 104 | try { Remove-Item $TMP_DOWNLOAD_DIR -Recurse -Force | Out-Null } 105 | catch { Write-Warning "Cannot remove $TMP_DOWNLOAD_DIR" } 106 | } 107 | } 108 | 109 | New-Item -Itemtype Directory -Path "$MAVEN_HOME_PARENT" -Force | Out-Null 110 | 111 | # Download and Install Apache Maven 112 | Write-Verbose "Couldn't find MAVEN_HOME, downloading and installing it ..." 113 | Write-Verbose "Downloading from: $distributionUrl" 114 | Write-Verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName" 115 | 116 | $webclient = New-Object System.Net.WebClient 117 | if ($env:MVNW_USERNAME -and $env:MVNW_PASSWORD) { 118 | $webclient.Credentials = New-Object System.Net.NetworkCredential($env:MVNW_USERNAME, $env:MVNW_PASSWORD) 119 | } 120 | [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 121 | $webclient.DownloadFile($distributionUrl, "$TMP_DOWNLOAD_DIR/$distributionUrlName") | Out-Null 122 | 123 | # If specified, validate the SHA-256 sum of the Maven distribution zip file 124 | $distributionSha256Sum = (Get-Content -Raw "$scriptDir/.mvn/wrapper/maven-wrapper.properties" | ConvertFrom-StringData).distributionSha256Sum 125 | if ($distributionSha256Sum) { 126 | if ($USE_MVND) { 127 | Write-Error "Checksum validation is not supported for maven-mvnd. `nPlease disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." 
128 | } 129 | Import-Module $PSHOME\Modules\Microsoft.PowerShell.Utility -Function Get-FileHash 130 | if ((Get-FileHash "$TMP_DOWNLOAD_DIR/$distributionUrlName" -Algorithm SHA256).Hash.ToLower() -ne $distributionSha256Sum) { 131 | Write-Error "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised. If you updated your Maven version, you need to update the specified distributionSha256Sum property." 132 | } 133 | } 134 | 135 | # unzip and move 136 | Expand-Archive "$TMP_DOWNLOAD_DIR/$distributionUrlName" -DestinationPath "$TMP_DOWNLOAD_DIR" | Out-Null 137 | Rename-Item -Path "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" -NewName $MAVEN_HOME_NAME | Out-Null 138 | try { 139 | Move-Item -Path "$TMP_DOWNLOAD_DIR/$MAVEN_HOME_NAME" -Destination $MAVEN_HOME_PARENT | Out-Null 140 | } catch { 141 | if (! (Test-Path -Path "$MAVEN_HOME" -PathType Container)) { 142 | Write-Error "fail to move MAVEN_HOME" 143 | } 144 | } finally { 145 | try { Remove-Item $TMP_DOWNLOAD_DIR -Recurse -Force | Out-Null } 146 | catch { Write-Warning "Cannot remove $TMP_DOWNLOAD_DIR" } 147 | } 148 | 149 | Write-Output "MVN_CMD=$MAVEN_HOME/bin/$MVN_CMD" 150 | -------------------------------------------------------------------------------- /mvnw: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # ---------------------------------------------------------------------------- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
You may obtain a copy of the License at 10 | # 11 | # https://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # ---------------------------------------------------------------------------- 20 | 21 | # ---------------------------------------------------------------------------- 22 | # Apache Maven Wrapper startup batch script, version 3.3.2 23 | # 24 | # Optional ENV vars 25 | # ----------------- 26 | # JAVA_HOME - location of a JDK home dir, required when download maven via java source 27 | # MVNW_REPOURL - repo url base for downloading maven distribution 28 | # MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven 29 | # MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output 30 | # ---------------------------------------------------------------------------- 31 | 32 | set -euf 33 | [ "${MVNW_VERBOSE-}" != debug ] || set -x 34 | 35 | # OS specific support. 36 | native_path() { printf %s\\n "$1"; } 37 | case "$(uname)" in 38 | CYGWIN* | MINGW*) 39 | [ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")" 40 | native_path() { cygpath --path --windows "$1"; } 41 | ;; 42 | esac 43 | 44 | # set JAVACMD and JAVACCMD 45 | set_java_home() { 46 | # For Cygwin and MinGW, ensure paths are in Unix format before anything is touched 47 | if [ -n "${JAVA_HOME-}" ]; then 48 | if [ -x "$JAVA_HOME/jre/sh/java" ]; then 49 | # IBM's JDK on AIX uses strange locations for the executables 50 | JAVACMD="$JAVA_HOME/jre/sh/java" 51 | JAVACCMD="$JAVA_HOME/jre/sh/javac" 52 | else 53 | JAVACMD="$JAVA_HOME/bin/java" 54 | JAVACCMD="$JAVA_HOME/bin/javac" 55 | 56 | if [ ! 
-x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then 57 | echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2 58 | echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2 59 | return 1 60 | fi 61 | fi 62 | else 63 | JAVACMD="$( 64 | 'set' +e 65 | 'unset' -f command 2>/dev/null 66 | 'command' -v java 67 | )" || : 68 | JAVACCMD="$( 69 | 'set' +e 70 | 'unset' -f command 2>/dev/null 71 | 'command' -v javac 72 | )" || : 73 | 74 | if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then 75 | echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2 76 | return 1 77 | fi 78 | fi 79 | } 80 | 81 | # hash string like Java String::hashCode 82 | hash_string() { 83 | str="${1:-}" h=0 84 | while [ -n "$str" ]; do 85 | char="${str%"${str#?}"}" 86 | h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296)) 87 | str="${str#?}" 88 | done 89 | printf %x\\n $h 90 | } 91 | 92 | verbose() { :; } 93 | [ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; } 94 | 95 | die() { 96 | printf %s\\n "$1" >&2 97 | exit 1 98 | } 99 | 100 | trim() { 101 | # MWRAPPER-139: 102 | # Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds. 103 | # Needed for removing poorly interpreted newline sequences when running in more 104 | # exotic environments such as mingw bash on Windows. 
105 | printf "%s" "${1}" | tr -d '[:space:]' 106 | } 107 | 108 | # parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties 109 | while IFS="=" read -r key value; do 110 | case "${key-}" in 111 | distributionUrl) distributionUrl=$(trim "${value-}") ;; 112 | distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;; 113 | esac 114 | done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties" 115 | [ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties" 116 | 117 | case "${distributionUrl##*/}" in 118 | maven-mvnd-*bin.*) 119 | MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ 120 | case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in 121 | *AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;; 122 | :Darwin*x86_64) distributionPlatform=darwin-amd64 ;; 123 | :Darwin*arm64) distributionPlatform=darwin-aarch64 ;; 124 | :Linux*x86_64*) distributionPlatform=linux-amd64 ;; 125 | *) 126 | echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2 127 | distributionPlatform=linux-amd64 128 | ;; 129 | esac 130 | distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip" 131 | ;; 132 | maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;; 133 | *) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;; 134 | esac 135 | 136 | # apply MVNW_REPOURL and calculate MAVEN_HOME 137 | # maven home pattern: ~/.m2/wrapper/dists/{apache-maven-,maven-mvnd--}/ 138 | [ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}" 139 | distributionUrlName="${distributionUrl##*/}" 140 | distributionUrlNameMain="${distributionUrlName%.*}" 141 | distributionUrlNameMain="${distributionUrlNameMain%-bin}" 142 | MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}" 143 | 
MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")" 144 | 145 | exec_maven() { 146 | unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || : 147 | exec "$MAVEN_HOME/bin/$MVN_CMD" "$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD" 148 | } 149 | 150 | if [ -d "$MAVEN_HOME" ]; then 151 | verbose "found existing MAVEN_HOME at $MAVEN_HOME" 152 | exec_maven "$@" 153 | fi 154 | 155 | case "${distributionUrl-}" in 156 | *?-bin.zip | *?maven-mvnd-?*-?*.zip) ;; 157 | *) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;; 158 | esac 159 | 160 | # prepare tmp dir 161 | if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then 162 | clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; } 163 | trap clean HUP INT TERM EXIT 164 | else 165 | die "cannot create temp dir" 166 | fi 167 | 168 | mkdir -p -- "${MAVEN_HOME%/*}" 169 | 170 | # Download and Install Apache Maven 171 | verbose "Couldn't find MAVEN_HOME, downloading and installing it ..." 172 | verbose "Downloading from: $distributionUrl" 173 | verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName" 174 | 175 | # select .zip or .tar.gz 176 | if ! command -v unzip >/dev/null; then 177 | distributionUrl="${distributionUrl%.zip}.tar.gz" 178 | distributionUrlName="${distributionUrl##*/}" 179 | fi 180 | 181 | # verbose opt 182 | __MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR='' 183 | [ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v 184 | 185 | # normalize http auth 186 | case "${MVNW_PASSWORD:+has-password}" in 187 | '') MVNW_USERNAME='' MVNW_PASSWORD='' ;; 188 | has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;; 189 | esac 190 | 191 | if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then 192 | verbose "Found wget ... 
using wget" 193 | wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl" 194 | elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then 195 | verbose "Found curl ... using curl" 196 | curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl" 197 | elif set_java_home; then 198 | verbose "Falling back to use Java to download" 199 | javaSource="$TMP_DOWNLOAD_DIR/Downloader.java" 200 | targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName" 201 | cat >"$javaSource" <<-END 202 | public class Downloader extends java.net.Authenticator 203 | { 204 | protected java.net.PasswordAuthentication getPasswordAuthentication() 205 | { 206 | return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() ); 207 | } 208 | public static void main( String[] args ) throws Exception 209 | { 210 | setDefault( new Downloader() ); 211 | java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() ); 212 | } 213 | } 214 | END 215 | # For Cygwin/MinGW, switch paths to Windows format before running javac and java 216 | verbose " - Compiling Downloader.java ..." 217 | "$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java" 218 | verbose " - Running Downloader.java ..." 219 | "$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")" 220 | fi 221 | 222 | # If specified, validate the SHA-256 sum of the Maven distribution zip file 223 | if [ -n "${distributionSha256Sum-}" ]; then 224 | distributionSha256Result=false 225 | if [ "$MVN_CMD" = mvnd.sh ]; then 226 | echo "Checksum validation is not supported for maven-mvnd." 
>&2 227 | echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2 228 | exit 1 229 | elif command -v sha256sum >/dev/null; then 230 | if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then 231 | distributionSha256Result=true 232 | fi 233 | elif command -v shasum >/dev/null; then 234 | if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then 235 | distributionSha256Result=true 236 | fi 237 | else 238 | echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2 239 | echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2 240 | exit 1 241 | fi 242 | if [ $distributionSha256Result = false ]; then 243 | echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2 244 | echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." 
>&2 245 | exit 1 246 | fi 247 | fi 248 | 249 | # unzip and move 250 | if command -v unzip >/dev/null; then 251 | unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip" 252 | else 253 | tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar" 254 | fi 255 | printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url" 256 | mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME" 257 | 258 | clean || : 259 | exec_maven "$@" 260 | -------------------------------------------------------------------------------- /src/main/resources/docs/file.json: -------------------------------------------------------------------------------- 1 | { 2 | "info": { 3 | "_postman_id": "732f0f05-b9c6-4763-b636-e4dc83c5cd11", 4 | "name": "car-dealer", 5 | "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json", 6 | "_exporter_id": "13628794", 7 | "_collection_link": "https://grey-water-607719.postman.co/workspace/My-Workspace~7c8e265d-3378-4686-836d-e08a8569baca/collection/13628794-732f0f05-b9c6-4763-b636-e4dc83c5cd11?action=share&source=collection_link&creator=13628794" 8 | }, 9 | "item": [ 10 | { 11 | "name": "auth", 12 | "item": [ 13 | { 14 | "name": "Register user", 15 | "request": { 16 | "method": "POST", 17 | "header": [], 18 | "body": { 19 | "mode": "raw", 20 | "raw": "{ \r\n \"fullName\": \"xullaaa\",\r\n \"phoneNumber\": \"+9647711500575\",\r\n \"password\":\"HeLlo@12345@dD\",\r\n \"confirmPassword\": \"HeLlo@12345@dD\"\r\n}", 21 | "options": { 22 | "raw": { 23 | "language": "json" 24 | } 25 | } 26 | }, 27 | "url": { 28 | "raw": "http://localhost:3000/api/v1/auth/register", 29 | "protocol": "http", 30 | "host": [ 31 | "localhost" 32 | ], 33 | "port": "3000", 34 | "path": [ 35 | "api", 36 | "v1", 37 | 
"auth", 38 | "register" 39 | ] 40 | } 41 | }, 42 | "response": [] 43 | }, 44 | { 45 | "name": "login user", 46 | "request": { 47 | "method": "POST", 48 | "header": [], 49 | "body": { 50 | "mode": "raw", 51 | "raw": "{ \r\n \"phoneNumber\": \"+9647711500575\",\r\n \"password\":\"HeLlo@12345@dD\"\r\n}", 52 | "options": { 53 | "raw": { 54 | "language": "json" 55 | } 56 | } 57 | }, 58 | "url": { 59 | "raw": "http://localhost:3000/api/v1/auth/login", 60 | "protocol": "http", 61 | "host": [ 62 | "localhost" 63 | ], 64 | "port": "3000", 65 | "path": [ 66 | "api", 67 | "v1", 68 | "auth", 69 | "login" 70 | ] 71 | } 72 | }, 73 | "response": [] 74 | } 75 | ] 76 | }, 77 | { 78 | "name": "showroom", 79 | "item": [ 80 | { 81 | "name": "Create showroom", 82 | "request": { 83 | "method": "POST", 84 | "header": [], 85 | "body": { 86 | "mode": "raw", 87 | "raw": "{\r\n \"name\": \"2dsad2\",\r\n \"phoneNumber\": \"+9647711500574\",\r\n \"email\": \"ao@gmail.com\",\r\n \"website\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"facebookLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"instagramLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"tiktokLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"snapchatLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"cityId\": \"kjh6jdt9e8clqo2bflfujk1n\",\r\n \"street\": \"street\",\r\n \"profilePicture\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"type\": \"SHOWROOM\",\r\n \"latitude\": 321,\r\n \"longitude\": 2321\r\n}", 88 | "options": { 89 | "raw": { 90 | "language": "json" 91 | } 92 | } 93 | }, 94 | "url": { 95 | "raw": "http://localhost:3000/api/v1/showrooms", 96 | "protocol": "http", 97 | "host": [ 98 | "localhost" 99 | ], 100 | "port": "3000", 101 | "path": [ 102 | "api", 103 | "v1", 104 | "showrooms" 105 | ] 106 | } 107 | }, 108 | "response": [] 109 | }, 110 | { 111 | "name": "Update showroom", 112 | "request": { 113 | "method": "PUT", 114 | "header": [], 115 | "body": { 116 | "mode": "raw", 117 
| "raw": "{\r\n \"name\": \"xula22\",\r\n \"phoneNumber\": \"+9647711500576\",\r\n \"email\": \"ao@gmail.com\",\r\n \"website\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"facebookLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"instagramLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"tiktokLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"snapchatLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"governorate\": \"governorate\",\r\n \"cityId\": \"kjh6jdt9e8clqo2bflfujk1n\",\r\n \"street\": \"street2\",\r\n \"profilePicture\": \"http://localhost:3000/api/v1/showrooms\",\r\n \"type\": \"DEALERSHIP\",\r\n \"latitude\": 321,\r\n \"longitude\": 2321\r\n}", 118 | "options": { 119 | "raw": { 120 | "language": "json" 121 | } 122 | } 123 | }, 124 | "url": { 125 | "raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25", 126 | "protocol": "http", 127 | "host": [ 128 | "localhost" 129 | ], 130 | "port": "3000", 131 | "path": [ 132 | "api", 133 | "v1", 134 | "showrooms", 135 | "b16xfrg4xy1o3bpkoocftd25" 136 | ] 137 | } 138 | }, 139 | "response": [] 140 | }, 141 | { 142 | "name": "Delete showroom", 143 | "request": { 144 | "auth": { 145 | "type": "bearer", 146 | "bearer": [ 147 | { 148 | "key": "token", 149 | "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6ImdycnpmaGVtZTJsZHIxbThua3diNmp6aCIsImlhdCI6MTcxMDk4MTcxNiwiZXhwIjoxNzEwOTg1MzE2fQ.xWp52J3VcL1GN53J-vWutFBd5W1hvZo4hi1fO9MYGJ8", 150 | "type": "string" 151 | } 152 | ] 153 | }, 154 | "method": "DELETE", 155 | "header": [], 156 | "url": { 157 | "raw": "http://localhost:3000/api/v1/showrooms/pni90mnecrgfn0r5jyp5x1dv", 158 | "protocol": "http", 159 | "host": [ 160 | "localhost" 161 | ], 162 | "port": "3000", 163 | "path": [ 164 | "api", 165 | "v1", 166 | "showrooms", 167 | "pni90mnecrgfn0r5jyp5x1dv" 168 | ] 169 | } 170 | }, 171 | "response": [] 172 | }, 173 | { 174 | "name": "My showrooms", 175 | "request": { 176 | "auth": { 177 | "type": "bearer", 178 | "bearer": [ 
179 | { 180 | "key": "token", 181 | "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6ImM1ZWZnbzJxMmJjNXBtYmNhaXJ3bmlvdiIsInBob25lTnVtYmVyIjoiKzk2NDc3MTE1MDA1NzQiLCJmdWxsTmFtZSI6Inh1bGxhYWEiLCJsZXZlbCI6IlVTRVIiLCJpYXQiOjE3MTI2MDg3ODEsImV4cCI6MTcxMjYxMjM4MX0.P2GzJSx_qzYtNmrdzXCy_lM4dtbM7o0Z2H6psArsb6g", 182 | "type": "string" 183 | } 184 | ] 185 | }, 186 | "method": "GET", 187 | "header": [], 188 | "url": { 189 | "raw": "http://localhost:3000/api/v1/showrooms/me/all", 190 | "protocol": "http", 191 | "host": [ 192 | "localhost" 193 | ], 194 | "port": "3000", 195 | "path": [ 196 | "api", 197 | "v1", 198 | "showrooms", 199 | "me", 200 | "all" 201 | ] 202 | } 203 | }, 204 | "response": [] 205 | }, 206 | { 207 | "name": "Get All showrooms", 208 | "request": { 209 | "method": "GET", 210 | "header": [], 211 | "url": { 212 | "raw": "http://localhost:3000/api/v1/showrooms", 213 | "protocol": "http", 214 | "host": [ 215 | "localhost" 216 | ], 217 | "port": "3000", 218 | "path": [ 219 | "api", 220 | "v1", 221 | "showrooms" 222 | ] 223 | } 224 | }, 225 | "response": [] 226 | }, 227 | { 228 | "name": "Get a single showroom", 229 | "request": { 230 | "method": "GET", 231 | "header": [], 232 | "url": { 233 | "raw": "http://localhost:3000/api/v1/showrooms/ic8nq13o5ns9h41cdbsnoh3z", 234 | "protocol": "http", 235 | "host": [ 236 | "localhost" 237 | ], 238 | "port": "3000", 239 | "path": [ 240 | "api", 241 | "v1", 242 | "showrooms", 243 | "ic8nq13o5ns9h41cdbsnoh3z" 244 | ] 245 | } 246 | }, 247 | "response": [] 248 | } 249 | ] 250 | }, 251 | { 252 | "name": "admin", 253 | "item": [ 254 | { 255 | "name": "metadata", 256 | "item": [ 257 | { 258 | "name": "create city", 259 | "request": { 260 | "auth": { 261 | "type": "bearer", 262 | "bearer": [ 263 | { 264 | "key": "token", 265 | "value": 
"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6ImM1ZWZnbzJxMmJjNXBtYmNhaXJ3bmlvdiIsInBob25lTnVtYmVyIjoiKzk2NDc3MTE1MDA1NzQiLCJmdWxsTmFtZSI6Inh1bGxhYWEiLCJsZXZlbCI6IlVTRVIiLCJpYXQiOjE3MTE4MzA2ODcsImV4cCI6MTcxMTgzNDI4N30.n0vux5Hr_cTwbl9Ps1P2xea9CHVm0oc5WQK35KWA_KA", 266 | "type": "string" 267 | } 268 | ] 269 | }, 270 | "method": "POST", 271 | "header": [], 272 | "body": { 273 | "mode": "raw", 274 | "raw": "{\r\n \"name\": \"kalar\",\r\n \"governorateId\": \"ilo29oos241vw755fi6oomnq\"\r\n}", 275 | "options": { 276 | "raw": { 277 | "language": "json" 278 | } 279 | } 280 | }, 281 | "url": { 282 | "raw": "http://localhost:3000/api/v1/admin/metadata/city", 283 | "protocol": "http", 284 | "host": [ 285 | "localhost" 286 | ], 287 | "port": "3000", 288 | "path": [ 289 | "api", 290 | "v1", 291 | "admin", 292 | "metadata", 293 | "city" 294 | ] 295 | } 296 | }, 297 | "response": [] 298 | }, 299 | { 300 | "name": "Create governorate", 301 | "request": { 302 | "method": "POST", 303 | "header": [], 304 | "body": { 305 | "mode": "raw", 306 | "raw": "{\r\n \"name\": \"sulaimaniyah\"\r\n}", 307 | "options": { 308 | "raw": { 309 | "language": "json" 310 | } 311 | } 312 | }, 313 | "url": { 314 | "raw": "http://localhost:3000/api/v1/admin/metadata/governorate", 315 | "protocol": "http", 316 | "host": [ 317 | "localhost" 318 | ], 319 | "port": "3000", 320 | "path": [ 321 | "api", 322 | "v1", 323 | "admin", 324 | "metadata", 325 | "governorate" 326 | ] 327 | } 328 | }, 329 | "response": [] 330 | } 331 | ] 332 | } 333 | ] 334 | }, 335 | { 336 | "name": "public", 337 | "item": [ 338 | { 339 | "name": "metadata", 340 | "item": [ 341 | { 342 | "name": "Get all cities", 343 | "request": { 344 | "method": "GET", 345 | "header": [], 346 | "url": { 347 | "raw": "http://localhost:3000/api/v1/public/metadata/cities", 348 | "protocol": "http", 349 | "host": [ 350 | "localhost" 351 | ], 352 | "port": "3000", 353 | "path": [ 354 | "api", 355 | "v1", 356 | "public", 357 | "metadata", 358 | "cities" 359 
| ] 360 | } 361 | }, 362 | "response": [] 363 | }, 364 | { 365 | "name": "Get All governorates", 366 | "request": { 367 | "method": "GET", 368 | "header": [], 369 | "url": { 370 | "raw": "http://localhost:3000/api/v1/public/metadata/governorates", 371 | "protocol": "http", 372 | "host": [ 373 | "localhost" 374 | ], 375 | "port": "3000", 376 | "path": [ 377 | "api", 378 | "v1", 379 | "public", 380 | "metadata", 381 | "governorates" 382 | ] 383 | } 384 | }, 385 | "response": [] 386 | } 387 | ] 388 | } 389 | ] 390 | }, 391 | { 392 | "name": "car", 393 | "item": [ 394 | { 395 | "name": "save car", 396 | "request": { 397 | "method": "POST", 398 | "header": [], 399 | "body": { 400 | "mode": "raw", 401 | "raw": "{\r\n \"brand\" : \"Toyota\",\r\n \"model\" : \"Camry\",\r\n \"year\" : 2023,\r\n \"trim\" : \"SE+\",\r\n \"transmission\" : \"gasoline\",\r\n \"color\": \"red\",\r\n \"fuel\" : \"gasoline\",\r\n \"plateType\" : \"vehicle\",\r\n \"plateCityId\": \"kjh6jdt9e8clqo2bflfujk1n\",\r\n \"importCountry\": \"usa\",\r\n \"price\" : 250000,\r\n \"currency\" : \"USD\",\r\n \"priceHidden\" : false,\r\n \"phoneNumber\" : \"+9647711500573\",\r\n \"inspectionDocumentLink\": \"https://google.com\",\r\n \"damages\":\"none\",\r\n \"damageType\": \"Clean\"\r\n}", 402 | "options": { 403 | "raw": { 404 | "language": "json" 405 | } 406 | } 407 | }, 408 | "url": { 409 | "raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25/cars", 410 | "protocol": "http", 411 | "host": [ 412 | "localhost" 413 | ], 414 | "port": "3000", 415 | "path": [ 416 | "api", 417 | "v1", 418 | "showrooms", 419 | "b16xfrg4xy1o3bpkoocftd25", 420 | "cars" 421 | ] 422 | } 423 | }, 424 | "response": [] 425 | }, 426 | { 427 | "name": "all my cars", 428 | "request": { 429 | "method": "GET", 430 | "header": [], 431 | "url": { 432 | "raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25/cars/me/all", 433 | "protocol": "http", 434 | "host": [ 435 | "localhost" 436 | ], 437 | "port": "3000", 
438 | "path": [ 439 | "api", 440 | "v1", 441 | "showrooms", 442 | "b16xfrg4xy1o3bpkoocftd25", 443 | "cars", 444 | "me", 445 | "all" 446 | ] 447 | } 448 | }, 449 | "response": [] 450 | }, 451 | { 452 | "name": "get my car by id", 453 | "request": { 454 | "method": "GET", 455 | "header": [], 456 | "url": { 457 | "raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25/cars/qz1t6580ogo8f1kc0jo4rram", 458 | "protocol": "http", 459 | "host": [ 460 | "localhost" 461 | ], 462 | "port": "3000", 463 | "path": [ 464 | "api", 465 | "v1", 466 | "showrooms", 467 | "b16xfrg4xy1o3bpkoocftd25", 468 | "cars", 469 | "qz1t6580ogo8f1kc0jo4rram" 470 | ] 471 | } 472 | }, 473 | "response": [] 474 | }, 475 | { 476 | "name": "delete a car from a show room", 477 | "request": { 478 | "method": "DELETE", 479 | "header": [], 480 | "url": { 481 | "raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25/cars/qz1t6580ogo8f1kc0jo4rram", 482 | "protocol": "http", 483 | "host": [ 484 | "localhost" 485 | ], 486 | "port": "3000", 487 | "path": [ 488 | "api", 489 | "v1", 490 | "showrooms", 491 | "b16xfrg4xy1o3bpkoocftd25", 492 | "cars", 493 | "qz1t6580ogo8f1kc0jo4rram" 494 | ] 495 | } 496 | }, 497 | "response": [] 498 | }, 499 | { 500 | "name": "upload car img", 501 | "request": { 502 | "method": "POST", 503 | "header": [], 504 | "body": { 505 | "mode": "formdata", 506 | "formdata": [ 507 | { 508 | "key": "image", 509 | "type": "file", 510 | "src": "/C:/Users/aland/Downloads/Stamp.JPG" 511 | } 512 | ] 513 | }, 514 | "url": { 515 | "raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25/cars/hg91wrvv52y76vnn0h30461p/images", 516 | "protocol": "http", 517 | "host": [ 518 | "localhost" 519 | ], 520 | "port": "3000", 521 | "path": [ 522 | "api", 523 | "v1", 524 | "showrooms", 525 | "b16xfrg4xy1o3bpkoocftd25", 526 | "cars", 527 | "hg91wrvv52y76vnn0h30461p", 528 | "images" 529 | ] 530 | } 531 | }, 532 | "response": [] 533 | } 534 | ] 535 | } 536 | ], 537 | 
"auth": { 538 | "type": "bearer", 539 | "bearer": [ 540 | { 541 | "key": "token", 542 | "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6ImRwZWk1ZWkzN2d1aG52ZzQ2bjBiN3lwdSIsInBob25lTnVtYmVyIjoiKzk2NDc3MTE1MDA1NzUiLCJmdWxsTmFtZSI6Inh1bGxhYWEiLCJsZXZlbCI6IlVTRVIiLCJpYXQiOjE3MTQ5Mjg4MzUsImV4cCI6MTcxNDkzMjQzNX0.Pj9uk_wJsPE21R_tg0C16XZjFnJgsI2lbAaX6yPD3BQ", 543 | "type": "string" 544 | } 545 | ] 546 | }, 547 | "event": [ 548 | { 549 | "listen": "prerequest", 550 | "script": { 551 | "type": "text/javascript", 552 | "packages": {}, 553 | "exec": [ 554 | "" 555 | ] 556 | } 557 | }, 558 | { 559 | "listen": "test", 560 | "script": { 561 | "type": "text/javascript", 562 | "packages": {}, 563 | "exec": [ 564 | "" 565 | ] 566 | } 567 | } 568 | ] 569 | } -------------------------------------------------------------------------------- /src/main/resources/docs/initial-design.md: -------------------------------------------------------------------------------- 1 | # ragscan initial design doc 2 | 3 | ## Objective 4 | 5 | The overall goal of this project is to test the application of Retreival Augmented Generation (RAG) to the reconnaissance phase of pentesting. RAG has shown 6 | to be effective in giving Large Language Models (LLM's) dynamic information in which it is not trained on for use to answer the users prompts and instructions. 7 | 8 | ## Overview 9 | This document will show the high level design of the initial approach taken for this tool. 10 | 11 | ## Processing of File and Document Input 12 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/f56a6f25-44bc-4d77-88f5-318fdfd0865c) 13 | 14 | ### Scrape Web Content 15 | As of now, there is not any plans to add any tooling that will do the actual scraping of web content as it would put less focus on the core functionality 16 | of this project and there are other tools that can do this much better and have had more project maturity. 
Many of these tools can be found in https://github.com/BruceDone/awesome-crawler. 17 | 18 | ### Input/Scraped Web Content 19 | The expected input to the tool are any pages, code, or information that was gathered through other tools and are stored in a directory in their original files and file 20 | structure. This ensures that there is tracking as to where information that the LLM processes is from and is easily identifiable for the user if they wish to manually inspect 21 | the identified file themselves after tool usage. 22 | 23 | ### Initial Loading of Content 24 | On execution of the tool, it will load all of the files in the specified directory, open them, convert them into standardized document objects that a vector 25 | database will be able to identify and use. 26 | 27 | ### Document to Embeddings Conversion 28 | After each of the files are loaded as documents, they can be converted into [embeddings](https://www.cloudflare.com/learning/ai/what-are-embeddings/) which is a unique 29 | representation of that specific object that can be used for more complex processing. This conversion is done through the use of [text embedding models](https://medium.com/@minh.hoque/text-embedding-models-an-insightful-dive-759ea53576f5) whose sole purpose is to create the vectorized representation of chunks of text. 30 | 31 | ### Storing the Embeddings 32 | After all of the previous processing is done, the generated embeddings can be stored within a [vector database](https://aws.amazon.com/what-is/vector-databases/) that's 33 | specialized in the ability to detect the similarity of object and content based on the space between their vector point location. This is an important prerequisite to the 34 | success of this RAG approach where there can be large amounts of documents that together would be larger than the token limit of current LLM's.
35 | 36 | ## Standard Flow on Tool Usage on User Prompt Input 37 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/472ee53d-7ccb-4f96-bd56-d1e81bd58c05) 38 | 39 | ### User Prompt 40 | The user prompt is the other input to this tool that is highly responsible for the output in which the vector database returns and for the LLM to process. An example prompt that 41 | a pentester may have when having a large set of documents are things like "is there an admin login page." or "what is the version of wordpress that this website is running on?". 42 | After this user prompt is gathered, the tool can start its execution. 43 | 44 | ### User Prompt Input Conversion to Embedding 45 | Because the documents gathered are now stored in the vector database alongside their embeddings, we need to convert the user input prompt into an embedding as well to calculate 46 | similarity. This is done through the use of an embedding model just as explained above with the conversion of the documents into embeddings 47 | 48 | ### Semantic Search Using Embeddings 49 | Once the user prompt input is converted into embeddings, it is now possible to use [semantic search](https://www.elastic.co/what-is/semantic-search) to find the relevant documents that are most similar to the user input prompt. 50 | 51 | ### Semantic Search Results 52 | Once the semantic search has been completed using the embeddings, the output should be the most relevant documents in relation to the user's input prompt that are ranked on the 53 | calculated relevancy to it. For example, if a user is asking for if there is an admin login page, the top result would most likely be the admin login page document, with lower 54 | ranked results possibly being other login pages or mentions of admins.
55 | 56 | ### Construct the Prompt to Feed to the LLM 57 | After gathering the most probable content that matches a user's input, we cannot just feed that directly to the LLM without any other context as it would not know exactly what the 58 | user wants. There are two other parts that will need to be added to the prompt to the LLM, one being the user input prompt, and another being a system prompt. The user input prompt 59 | is to guide the LLM to do the exact task that the user would like, such as identifying if there is an admin page. The system prompt is used to guide the LLM to answer in a specific 60 | manner and to understand the context that the user is conducting a pentest and may want a more concise response. Putting these three things together creates the full prompt that will 61 | be fed to the LLM. 62 | 63 | ### Feeding Prompt to LLM 64 | The constructed prompt is then fed to the LLM for it to summarize and provide specific context to the user. Depending on the system prompt given to the LLM, its response could be just the fact that it identified a possible result, or it can go further and explain why it thinks that it is or is not what the user is looking for. This is the most unpredictable step but 65 | highly valuable given that the prompt is correct and no alignment in the LLM renders the results unusable. 66 | 67 | ### Return the Output to the User 68 | After the LLM has done what it was tasked with doing, the generated response should be returned to the user alongside the location or identifier of the files used as input as part of the prompt to the LLM. This could be the location of the admin document in the directory that was used to feed documents into the vector database, or even more verbose identifiers such as line numbers or file hashes. This output should give the output that the user was looking for, making the recon process much easier.
69 | 70 | 71 | # ragscan initial design doc 72 | 73 | ## Objective 74 | 75 | The overall goal of this project is to test the application of Retreival Augmented Generation (RAG) to the reconnaissance phase of pentesting. RAG has shown 76 | to be effective in giving Large Language Models (LLM's) dynamic information in which it is not trained on for use to answer the users prompts and instructions. 77 | 78 | ## Overview 79 | This document will show the high level design of the initial approach taken for this tool. 80 | 81 | ## Processing of File and Document Input 82 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/f56a6f25-44bc-4d77-88f5-318fdfd0865c) 83 | 84 | ### Scrape Web Content 85 | As of now, there is not any plans to add any tooling that will do the actual scraping of web content as it would put less focus on the core functionality 86 | of this project and there are other tools that can do this much better and have had more project maturity. Many of these tools can be found in https://github.com/BruceDone/awesome-crawler. 87 | 88 | ### Input/Scraped Web Content 89 | The expected input to the tool are any pages, code, or information that was gathered through other tools and are stored in a directory in their original files and file 90 | structure. This ensures that there is tracking as to where information that the LLM processes is from and is easily identifiable for the user if they wish to manually inspect 91 | the identified file themselves after tool usage. 92 | 93 | ### Initial Loading of Content 94 | On execution of the tool, it will load all of the files in the specified directory, open them, convert them into standardized document objects that a vector 95 | database will be able to identify and use. 
96 | 97 | ### Document to Embeddings Conversion 98 | After each of the files are loaded as documents, they can be converted into [embeddings](https://www.cloudflare.com/learning/ai/what-are-embeddings/) which is a unique 99 | representation of that specific object that can be used for more complex processing. This conversion is done through the use of [text embedding models](https://medium.com/@minh.hoque/text-embedding-models-an-insightful-dive-759ea53576f5) thats sole purpose is to create the vectorized representation of chunks of text. 100 | 101 | ### Storing the Embeddings 102 | After all of the previous processing is done, the generated embeddings can be stored within a [vector database](https://aws.amazon.com/what-is/vector-databases/) that's 103 | specialized in the ability to detect the similarity of object and content based on the space between their vector point location. This is an important prerequisite to the 104 | success of this RAG approach where there can large amounts of documents that together would be larger than the token limit of current LLM's. 105 | 106 | ## Standard Flow on Tool Usage on User Prompt Input 107 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/472ee53d-7ccb-4f96-bd56-d1e81bd58c05) 108 | 109 | ### User Prompt 110 | The user prompt is the other input to this tool that is highly responsible for the output in which the vector database returns and for the LLM to process. An example prompt that 111 | a pentester may have when having a large set of documents are are things like "is there an admin login page." or "what is the version of wordpress that this website is running on?". 112 | After this user prompt is gathered, the tool can start it execution 113 | 114 | ### User Prompt Input Conversion to Embedding 115 | Because the documents gathered are now stored in the vector database alongside their embeddings, we need to convert the user input prompt into an embedding as well to calculate 116 | similarity. 
This is done through the use of an embedding model just as explained above with the conversion of the documents into embeddings 117 | 118 | ### Semantic Search Using Embeddings 119 | Once the user prompt input is converted into embeddings, it is now possible to use [semantic search](https://www.elastic.co/what-is/semantic-search) to find the relevant documents that are most similar to the user input prompt. 120 | 121 | ### Semantic Search Results 122 | Once the semantic search has been completed using the embeddings, the output should be the most relevant documents in relation to the users input prompt that are ranked on the 123 | calculated relevancy to it. For example, if a user is asking for if there is an admin login page, the top result would most likely be the admin login page document, with lower 124 | ranked results possibly being other login pages or mentions of admins. 125 | 126 | ### Construct the Prompt to Feed to the LLM 127 | After gathering the most probable content that matches a users input, we can not just feed that directly to the LLM without any other context as it would not know exactly what the 128 | user wants. There are two other parts that will need to be added to the prompt to the LLM, one being the user input prompt, and another being a system prompt. The user input prompt 129 | is to guide the LLM to do the exact task that the user would like, such as identifying if there is an admin page. The system prompt is used to guide the LLM to answer in a specific 130 | manner and to understand the context that the user is conducting a pentest and may want a more concise response. Putting these three things together creates the full prompt that will 131 | be fed to the LLM. 132 | 133 | ### Feeding Prompt to LLM 134 | The constructed prompt is then fed to the LLM for it to summarize and provide specific context to the user. 
Depending on the system prompt given to the LLM, its response could be just the fact that it identified a possible result, or it can go further and explain why it things that it is or is not what the user is looking for. This is the most unpredictable step but 135 | highly valuable given that the prompt is correct and no alignment in the LLM renders the results unusable. 136 | 137 | ### Return the Output to the User 138 | After the LLM has done what it was tasked with doing, the generated response should return to the user alongside the location or identifier of the files used as input as part of the prompt to the LLM. This could be the location of the admin document in the directory that was used to feed documents into the vector database, or even more verbose identifiers such as line numbers or file hashes. This output should give the output that the user was looking for and making the recon process much easier. 139 | 140 | # ragscan initial design doc 141 | 142 | ## Objective 143 | 144 | The overall goal of this project is to test the application of Retreival Augmented Generation (RAG) to the reconnaissance phase of pentesting. RAG has shown 145 | to be effective in giving Large Language Models (LLM's) dynamic information in which it is not trained on for use to answer the users prompts and instructions. 146 | 147 | ## Overview 148 | This document will show the high level design of the initial approach taken for this tool. 149 | 150 | ## Processing of File and Document Input 151 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/f56a6f25-44bc-4d77-88f5-318fdfd0865c) 152 | 153 | ### Scrape Web Content 154 | As of now, there is not any plans to add any tooling that will do the actual scraping of web content as it would put less focus on the core functionality 155 | of this project and there are other tools that can do this much better and have had more project maturity. 
Many of these tools can be found in https://github.com/BruceDone/awesome-crawler. 156 | 157 | ### Input/Scraped Web Content 158 | The expected input to the tool are any pages, code, or information that was gathered through other tools and are stored in a directory in their original files and file 159 | structure. This ensures that there is tracking as to where information that the LLM processes is from and is easily identifiable for the user if they wish to manually inspect 160 | the identified file themselves after tool usage. 161 | 162 | ### Initial Loading of Content 163 | On execution of the tool, it will load all of the files in the specified directory, open them, convert them into standardized document objects that a vector 164 | database will be able to identify and use. 165 | 166 | ### Document to Embeddings Conversion 167 | After each of the files are loaded as documents, they can be converted into [embeddings](https://www.cloudflare.com/learning/ai/what-are-embeddings/) which is a unique 168 | representation of that specific object that can be used for more complex processing. This conversion is done through the use of [text embedding models](https://medium.com/@minh.hoque/text-embedding-models-an-insightful-dive-759ea53576f5) thats sole purpose is to create the vectorized representation of chunks of text. 169 | 170 | ### Storing the Embeddings 171 | After all of the previous processing is done, the generated embeddings can be stored within a [vector database](https://aws.amazon.com/what-is/vector-databases/) that's 172 | specialized in the ability to detect the similarity of object and content based on the space between their vector point location. This is an important prerequisite to the 173 | success of this RAG approach where there can large amounts of documents that together would be larger than the token limit of current LLM's. 
174 | 175 | ## Standard Flow on Tool Usage on User Prompt Input 176 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/472ee53d-7ccb-4f96-bd56-d1e81bd58c05) 177 | 178 | ### User Prompt 179 | The user prompt is the other input to this tool that is highly responsible for the output in which the vector database returns and for the LLM to process. An example prompt that 180 | a pentester may have when having a large set of documents are are things like "is there an admin login page." or "what is the version of wordpress that this website is running on?". 181 | After this user prompt is gathered, the tool can start it execution 182 | 183 | ### User Prompt Input Conversion to Embedding 184 | Because the documents gathered are now stored in the vector database alongside their embeddings, we need to convert the user input prompt into an embedding as well to calculate 185 | similarity. This is done through the use of an embedding model just as explained above with the conversion of the documents into embeddings 186 | 187 | ### Semantic Search Using Embeddings 188 | Once the user prompt input is converted into embeddings, it is now possible to use [semantic search](https://www.elastic.co/what-is/semantic-search) to find the relevant documents that are most similar to the user input prompt. 189 | 190 | ### Semantic Search Results 191 | Once the semantic search has been completed using the embeddings, the output should be the most relevant documents in relation to the users input prompt that are ranked on the 192 | calculated relevancy to it. For example, if a user is asking for if there is an admin login page, the top result would most likely be the admin login page document, with lower 193 | ranked results possibly being other login pages or mentions of admins. 
194 | 195 | ### Construct the Prompt to Feed to the LLM 196 | After gathering the most probable content that matches a users input, we can not just feed that directly to the LLM without any other context as it would not know exactly what the 197 | user wants. There are two other parts that will need to be added to the prompt to the LLM, one being the user input prompt, and another being a system prompt. The user input prompt 198 | is to guide the LLM to do the exact task that the user would like, such as identifying if there is an admin page. The system prompt is used to guide the LLM to answer in a specific 199 | manner and to understand the context that the user is conducting a pentest and may want a more concise response. Putting these three things together creates the full prompt that will 200 | be fed to the LLM. 201 | 202 | ### Feeding Prompt to LLM 203 | The constructed prompt is then fed to the LLM for it to summarize and provide specific context to the user. Depending on the system prompt given to the LLM, its response could be just the fact that it identified a possible result, or it can go further and explain why it things that it is or is not what the user is looking for. This is the most unpredictable step but 204 | highly valuable given that the prompt is correct and no alignment in the LLM renders the results unusable. 205 | 206 | ### Return the Output to the User 207 | After the LLM has done what it was tasked with doing, the generated response should return to the user alongside the location or identifier of the files used as input as part of the prompt to the LLM. This could be the location of the admin document in the directory that was used to feed documents into the vector database, or even more verbose identifiers such as line numbers or file hashes. This output should give the output that the user was looking for and making the recon process much easier. 
208 | 209 | # ragscan initial design doc 210 | 211 | ## Objective 212 | 213 | The overall goal of this project is to test the application of Retreival Augmented Generation (RAG) to the reconnaissance phase of pentesting. RAG has shown 214 | to be effective in giving Large Language Models (LLM's) dynamic information in which it is not trained on for use to answer the users prompts and instructions. 215 | 216 | ## Overview 217 | This document will show the high level design of the initial approach taken for this tool. 218 | 219 | ## Processing of File and Document Input 220 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/f56a6f25-44bc-4d77-88f5-318fdfd0865c) 221 | 222 | ### Scrape Web Content 223 | As of now, there is not any plans to add any tooling that will do the actual scraping of web content as it would put less focus on the core functionality 224 | of this project and there are other tools that can do this much better and have had more project maturity. Many of these tools can be found in https://github.com/BruceDone/awesome-crawler. 225 | 226 | ### Input/Scraped Web Content 227 | The expected input to the tool are any pages, code, or information that was gathered through other tools and are stored in a directory in their original files and file 228 | structure. This ensures that there is tracking as to where information that the LLM processes is from and is easily identifiable for the user if they wish to manually inspect 229 | the identified file themselves after tool usage. 230 | 231 | ### Initial Loading of Content 232 | On execution of the tool, it will load all of the files in the specified directory, open them, convert them into standardized document objects that a vector 233 | database will be able to identify and use. 
234 | 235 | ### Document to Embeddings Conversion 236 | After each of the files are loaded as documents, they can be converted into [embeddings](https://www.cloudflare.com/learning/ai/what-are-embeddings/) which is a unique 237 | representation of that specific object that can be used for more complex processing. This conversion is done through the use of [text embedding models](https://medium.com/@minh.hoque/text-embedding-models-an-insightful-dive-759ea53576f5) thats sole purpose is to create the vectorized representation of chunks of text. 238 | 239 | ### Storing the Embeddings 240 | After all of the previous processing is done, the generated embeddings can be stored within a [vector database](https://aws.amazon.com/what-is/vector-databases/) that's 241 | specialized in the ability to detect the similarity of object and content based on the space between their vector point location. This is an important prerequisite to the 242 | success of this RAG approach where there can large amounts of documents that together would be larger than the token limit of current LLM's. 243 | 244 | ## Standard Flow on Tool Usage on User Prompt Input 245 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/472ee53d-7ccb-4f96-bd56-d1e81bd58c05) 246 | 247 | ### User Prompt 248 | The user prompt is the other input to this tool that is highly responsible for the output in which the vector database returns and for the LLM to process. An example prompt that 249 | a pentester may have when having a large set of documents are are things like "is there an admin login page." or "what is the version of wordpress that this website is running on?". 250 | After this user prompt is gathered, the tool can start it execution 251 | 252 | ### User Prompt Input Conversion to Embedding 253 | Because the documents gathered are now stored in the vector database alongside their embeddings, we need to convert the user input prompt into an embedding as well to calculate 254 | similarity. 
This is done through the use of an embedding model just as explained above with the conversion of the documents into embeddings 255 | 256 | ### Semantic Search Using Embeddings 257 | Once the user prompt input is converted into embeddings, it is now possible to use [semantic search](https://www.elastic.co/what-is/semantic-search) to find the relevant documents that are most similar to the user input prompt. 258 | 259 | ### Semantic Search Results 260 | Once the semantic search has been completed using the embeddings, the output should be the most relevant documents in relation to the users input prompt that are ranked on the 261 | calculated relevancy to it. For example, if a user is asking for if there is an admin login page, the top result would most likely be the admin login page document, with lower 262 | ranked results possibly being other login pages or mentions of admins. 263 | 264 | ### Construct the Prompt to Feed to the LLM 265 | After gathering the most probable content that matches a users input, we can not just feed that directly to the LLM without any other context as it would not know exactly what the 266 | user wants. There are two other parts that will need to be added to the prompt to the LLM, one being the user input prompt, and another being a system prompt. The user input prompt 267 | is to guide the LLM to do the exact task that the user would like, such as identifying if there is an admin page. The system prompt is used to guide the LLM to answer in a specific 268 | manner and to understand the context that the user is conducting a pentest and may want a more concise response. Putting these three things together creates the full prompt that will 269 | be fed to the LLM. 270 | 271 | ### Feeding Prompt to LLM 272 | The constructed prompt is then fed to the LLM for it to summarize and provide specific context to the user. 
Depending on the system prompt given to the LLM, its response could be just the fact that it identified a possible result, or it can go further and explain why it things that it is or is not what the user is looking for. This is the most unpredictable step but 273 | highly valuable given that the prompt is correct and no alignment in the LLM renders the results unusable. 274 | 275 | ### Return the Output to the User 276 | After the LLM has done what it was tasked with doing, the generated response should return to the user alongside the location or identifier of the files used as input as part of the prompt to the LLM. This could be the location of the admin document in the directory that was used to feed documents into the vector database, or even more verbose identifiers such as line numbers or file hashes. This output should give the output that the user was looking for and making the recon process much easier. 277 | 278 | # ragscan initial design doc 279 | 280 | ## Objective 281 | 282 | The overall goal of this project is to test the application of Retreival Augmented Generation (RAG) to the reconnaissance phase of pentesting. RAG has shown 283 | to be effective in giving Large Language Models (LLM's) dynamic information in which it is not trained on for use to answer the users prompts and instructions. 284 | 285 | ## Overview 286 | This document will show the high level design of the initial approach taken for this tool. 287 | 288 | ## Processing of File and Document Input 289 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/f56a6f25-44bc-4d77-88f5-318fdfd0865c) 290 | 291 | ### Scrape Web Content 292 | As of now, there is not any plans to add any tooling that will do the actual scraping of web content as it would put less focus on the core functionality 293 | of this project and there are other tools that can do this much better and have had more project maturity. 
Many of these tools can be found in https://github.com/BruceDone/awesome-crawler. 294 | 295 | ### Input/Scraped Web Content 296 | The expected input to the tool are any pages, code, or information that was gathered through other tools and are stored in a directory in their original files and file 297 | structure. This ensures that there is tracking as to where information that the LLM processes is from and is easily identifiable for the user if they wish to manually inspect 298 | the identified file themselves after tool usage. 299 | 300 | ### Initial Loading of Content 301 | On execution of the tool, it will load all of the files in the specified directory, open them, convert them into standardized document objects that a vector 302 | database will be able to identify and use. 303 | 304 | ### Document to Embeddings Conversion 305 | After each of the files are loaded as documents, they can be converted into [embeddings](https://www.cloudflare.com/learning/ai/what-are-embeddings/) which is a unique 306 | representation of that specific object that can be used for more complex processing. This conversion is done through the use of [text embedding models](https://medium.com/@minh.hoque/text-embedding-models-an-insightful-dive-759ea53576f5) thats sole purpose is to create the vectorized representation of chunks of text. 307 | 308 | ### Storing the Embeddings 309 | After all of the previous processing is done, the generated embeddings can be stored within a [vector database](https://aws.amazon.com/what-is/vector-databases/) that's 310 | specialized in the ability to detect the similarity of object and content based on the space between their vector point location. This is an important prerequisite to the 311 | success of this RAG approach where there can large amounts of documents that together would be larger than the token limit of current LLM's. 
312 | 313 | ## Standard Flow on Tool Usage on User Prompt Input 314 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/472ee53d-7ccb-4f96-bd56-d1e81bd58c05) 315 | 316 | ### User Prompt 317 | The user prompt is the other input to this tool that is highly responsible for the output in which the vector database returns and for the LLM to process. An example prompt that 318 | a pentester may have when having a large set of documents are are things like "is there an admin login page." or "what is the version of wordpress that this website is running on?". 319 | After this user prompt is gathered, the tool can start it execution 320 | 321 | ### User Prompt Input Conversion to Embedding 322 | Because the documents gathered are now stored in the vector database alongside their embeddings, we need to convert the user input prompt into an embedding as well to calculate 323 | similarity. This is done through the use of an embedding model just as explained above with the conversion of the documents into embeddings 324 | 325 | ### Semantic Search Using Embeddings 326 | Once the user prompt input is converted into embeddings, it is now possible to use [semantic search](https://www.elastic.co/what-is/semantic-search) to find the relevant documents that are most similar to the user input prompt. 327 | 328 | ### Semantic Search Results 329 | Once the semantic search has been completed using the embeddings, the output should be the most relevant documents in relation to the users input prompt that are ranked on the 330 | calculated relevancy to it. For example, if a user is asking for if there is an admin login page, the top result would most likely be the admin login page document, with lower 331 | ranked results possibly being other login pages or mentions of admins. 
332 | 333 | ### Construct the Prompt to Feed to the LLM 334 | After gathering the most probable content that matches a users input, we can not just feed that directly to the LLM without any other context as it would not know exactly what the 335 | user wants. There are two other parts that will need to be added to the prompt to the LLM, one being the user input prompt, and another being a system prompt. The user input prompt 336 | is to guide the LLM to do the exact task that the user would like, such as identifying if there is an admin page. The system prompt is used to guide the LLM to answer in a specific 337 | manner and to understand the context that the user is conducting a pentest and may want a more concise response. Putting these three things together creates the full prompt that will 338 | be fed to the LLM. 339 | 340 | ### Feeding Prompt to LLM 341 | The constructed prompt is then fed to the LLM for it to summarize and provide specific context to the user. Depending on the system prompt given to the LLM, its response could be just the fact that it identified a possible result, or it can go further and explain why it things that it is or is not what the user is looking for. This is the most unpredictable step but 342 | highly valuable given that the prompt is correct and no alignment in the LLM renders the results unusable. 343 | 344 | ### Return the Output to the User 345 | After the LLM has done what it was tasked with doing, the generated response should return to the user alongside the location or identifier of the files used as input as part of the prompt to the LLM. This could be the location of the admin document in the directory that was used to feed documents into the vector database, or even more verbose identifiers such as line numbers or file hashes. This output should give the output that the user was looking for and making the recon process much easier. 346 | --------------------------------------------------------------------------------