├── src
    ├── test
    │   └── java
    │   │   └── lunatix
    │   │       └── ragscan
    │   │           └── RagscanApplicationTests.java
    └── main
    │   ├── java
    │       └── lunatix
    │       │   └── ragscan
    │       │       ├── loader
    │       │           ├── FileLoader.java
    │       │           ├── FileLoaderFactory.java
    │       │           ├── JsonFileLoader.java
    │       │           ├── TextFileLoader.java
    │       │           ├── OtherFileLoader.java
    │       │           ├── FileType.java
    │       │           ├── PdfFileLoader.java
    │       │           └── FilesReader.java
    │       │       ├── RagscanApplication.java
    │       │       ├── store
    │       │           ├── QdrantStoreConfigurations.java
    │       │           ├── EmbeddingConfigurations.java
    │       │           └── GeminiOpenAiEmbeddingModel.java
    │       │       └── gemini
    │       │           └── GeminiCommand.java
    │   └── resources
    │       ├── application.properties
    │       └── docs
    │           ├── file.json
    │           └── initial-design.md
├── docker-compose.yaml
├── .gitignore
├── .mvn
    └── wrapper
    │   └── maven-wrapper.properties
├── LICENSE
├── .github
    └── workflows
    │   └── build.yml
├── settings.xml
├── README.md
├── pom.xml
├── mvnw.cmd
└── mvnw


/src/test/java/lunatix/ragscan/RagscanApplicationTests.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan;
 2 | 
 3 | import org.junit.jupiter.api.Test;
 4 | import org.springframework.boot.test.context.SpringBootTest;
 5 | 
 6 | @SpringBootTest
 7 | class RagscanApplicationTests {
 8 | 
 9 |     @Test
10 |     void contextLoads() {
11 |     }
12 | 
13 | }
14 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/loader/FileLoader.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.loader;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import org.springframework.ai.document.Document;
 6 | import org.springframework.core.io.Resource;
 7 | 
 8 | public interface FileLoader {
 9 | 
10 |     public List<Document> load(Resource resource);
11 | }
12 | 


--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   qdrant:
 3 |     image: qdrant/qdrant:v1.13.0
 4 |     environment:
 5 |       QDRANT__SERVICE__GRPC_PORT: 6334
 6 |       QDRANT__SERVICE__REST_PORT: 6333
 7 |     ports:
 8 |       - "6333:6333" # tcp
 9 |       - "6334:6334" # grpc
10 |     volumes:
11 |       - ./qdrant_data:/qdrant/storage
12 | 
13 | volumes:
14 |   models_cache:
15 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/loader/FileLoaderFactory.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.loader;
 2 | 
 3 | public class FileLoaderFactory {
 4 | 
 5 |     private FileLoaderFactory() {}
 6 | 
 7 |     public static FileLoader create(FileType fileType) {
 8 |         return switch (fileType) {
 9 |             case PDF -> new PdfFileLoader();
10 |             case TXT -> new TextFileLoader();
11 |             case JSON, HTML, XML, OTHER -> new OtherFileLoader();
12 |         };
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/RagscanApplication.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan;
 2 | 
 3 | import org.springframework.boot.SpringApplication;
 4 | import org.springframework.boot.autoconfigure.SpringBootApplication;
 5 | import org.springframework.shell.command.annotation.CommandScan;
 6 | 
 7 | @SpringBootApplication
 8 | @CommandScan
 9 | public class RagscanApplication {
10 | 
11 |     public static void main(String[] args) {
12 |         SpringApplication.run(RagscanApplication.class, args);
13 |     }
14 | 
15 | }
16 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | HELP.md
 2 | target/
 3 | !.mvn/wrapper/maven-wrapper.jar
 4 | !**/src/main/**/target/
 5 | !**/src/test/**/target/
 6 | 
 7 | ### STS ###
 8 | .apt_generated
 9 | .classpath
10 | .factorypath
11 | .project
12 | .settings
13 | .springBeans
14 | .sts4-cache
15 | 
16 | ### IntelliJ IDEA ###
17 | .idea
18 | *.iws
19 | *.iml
20 | *.ipr
21 | 
22 | ### NetBeans ###
23 | /nbproject/private/
24 | /nbbuild/
25 | /dist/
26 | /nbdist/
27 | /.nb-gradle/
28 | build/
29 | !**/src/main/**/build/
30 | !**/src/test/**/build/
31 | 
32 | ### VS Code ###
33 | .vscode/
34 | 
35 | ### QDRANT
36 | /qdrant_data
37 | 
38 | ### logs
39 | *.log
40 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/loader/JsonFileLoader.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.loader;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import lombok.extern.slf4j.Slf4j;
 6 | import org.springframework.ai.document.Document;
 7 | import org.springframework.ai.reader.JsonReader;
 8 | import org.springframework.core.io.Resource;
 9 | 
10 | @Slf4j
11 | public class JsonFileLoader implements FileLoader {
12 | 
13 |     @Override
14 |     public List<Document> load(Resource resource) {
15 |         log.info("Loading json file {}", resource.getFilename());
16 |         final var jsonFile = new JsonReader(resource);
17 |         return jsonFile.read();
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/loader/TextFileLoader.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.loader;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import lombok.extern.slf4j.Slf4j;
 6 | import org.springframework.ai.document.Document;
 7 | import org.springframework.ai.reader.TextReader;
 8 | import org.springframework.core.io.Resource;
 9 | 
10 | @Slf4j
11 | public class TextFileLoader implements FileLoader {
12 | 
13 |     @Override
14 |     public List<Document> load(Resource resource) {
15 |         log.info("Loading text file {}", resource.getFilename());
16 |         final var textReader = new TextReader(resource);
17 |         return textReader.read();
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/loader/OtherFileLoader.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.loader;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import lombok.extern.slf4j.Slf4j;
 6 | import org.springframework.ai.document.Document;
 7 | import org.springframework.ai.reader.tika.TikaDocumentReader;
 8 | import org.springframework.core.io.Resource;
 9 | 
10 | @Slf4j
11 | public class OtherFileLoader implements FileLoader {
12 | 
13 |     /**
14 |      * This Supports variant of files, for example DOCX, PPTX, HTML, XML etc.
15 |      * <a href="https://tika.apache.org/2.9.0/formats.html">click</a> here for full list
16 |      */
17 |     @Override
18 |     public List<Document> load(Resource resource) {
19 |         log.info("Loading file {}", resource.getFilename());
20 |         final var file = new TikaDocumentReader(resource);
21 |         return file.read();
22 |     }
23 | }
24 | 


--------------------------------------------------------------------------------
/.mvn/wrapper/maven-wrapper.properties:
--------------------------------------------------------------------------------
 1 | # Licensed to the Apache Software Foundation (ASF) under one
 2 | # or more contributor license agreements.  See the NOTICE file
 3 | # distributed with this work for additional information
 4 | # regarding copyright ownership.  The ASF licenses this file
 5 | # to you under the Apache License, Version 2.0 (the
 6 | # "License"); you may not use this file except in compliance
 7 | # with the License.  You may obtain a copy of the License at
 8 | #
 9 | #   https://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied.  See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | wrapperVersion=3.3.2
18 | distributionType=only-script
19 | distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.7/apache-maven-3.9.7-bin.zip
20 | 


--------------------------------------------------------------------------------
/src/main/resources/application.properties:
--------------------------------------------------------------------------------
 1 | spring.application.name=ragscan
 2 | 
 3 | spring.main.web-application-type=none
 4 | server.port=9090
 5 | 
 6 | spring.shell.interactive.enabled=true
 7 | spring.shell.script.enabled=true
 8 | 
 9 | spring.threads.virtual.enabled=true
10 | 
11 | spring.ai.vectorstore.qdrant.host=localhost
12 | spring.ai.vectorstore.qdrant.port=6334
13 | spring.ai.vectorstore.qdrant.collection-name=ragscan
14 | spring.ai.vectorstore.qdrant.initialize-schema=true
15 | 
16 | spring.ai.openai.api-key=${GOOGLE_API_KEY}
17 | spring.ai.openai.base-url=https://generativelanguage.googleapis.com
18 | spring.ai.openai.chat.options.model=gemini-2.0-flash
19 | spring.ai.openai.chat.completions-path=/v1beta/openai/chat/completions
20 | 
21 | spring.ai.openai.embedding.api-key=${GOOGLE_API_KEY}
22 | spring.ai.openai.embedding.base-url=https://generativelanguage.googleapis.com
23 | spring.ai.openai.embedding.embeddings-path=/v1beta/openai/embeddings
24 | spring.ai.openai.embedding.options.model=gemini-embedding-exp-03-07
25 | spring.ai.openai.embedding.options.dimensions=768


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Aland Osman
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/loader/FileType.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.loader;
 2 | 
 3 | import java.nio.file.Path;
 4 | import java.util.Arrays;
 5 | import java.util.List;
 6 | 
 7 | import lombok.AllArgsConstructor;
 8 | import lombok.Getter;
 9 | import lombok.extern.slf4j.Slf4j;
10 | import org.apache.commons.io.FilenameUtils;
11 | 
12 | @Getter
13 | @AllArgsConstructor
14 | @Slf4j
15 | public enum FileType {
16 |     HTML(List.of("htm", "html")),
17 |     TXT(List.of("txt")),
18 |     PDF(List.of("pdf")),
19 |     JSON(List.of("json")),
20 |     XML(List.of("xml")),
21 |     OTHER(List.of());
22 | 
23 |     private final List<String> fileExtensions;
24 | 
25 |     public static FileType fromFileExtension(String fileExtension) {
26 |         log.info("Getting fileType for {}", fileExtension);
27 |         return Arrays.stream(FileType.values())
28 |                 .filter(f -> f.getFileExtensions().contains(fileExtension.toLowerCase()))
29 |                 .findFirst()
30 |                 .orElse(OTHER);
31 |     }
32 | 
33 |     public static String getFileExtension(Path filePath) {
34 |         log.info("Getting file extension for {}", filePath);
35 |         return FilenameUtils.getExtension(filePath.toString());
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/loader/PdfFileLoader.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.loader;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import lombok.extern.slf4j.Slf4j;
 6 | import org.springframework.ai.document.Document;
 7 | import org.springframework.ai.reader.ExtractedTextFormatter;
 8 | import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
 9 | import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
10 | import org.springframework.core.io.Resource;
11 | 
12 | @Slf4j
13 | public class PdfFileLoader implements FileLoader {
14 | 
15 |     @Override
16 |     public List<Document> load(Resource resource) {
17 |         log.info("Loading PDF Document {}", resource.getFilename());
18 |         PagePdfDocumentReader pdfFile = new PagePdfDocumentReader(resource,
19 |                 PdfDocumentReaderConfig.builder()
20 |                         .withPageTopMargin(0)
21 |                         .withPageExtractedTextFormatter(ExtractedTextFormatter.builder()
22 |                                 .withNumberOfTopTextLinesToDelete(0)
23 |                                 .build())
24 |                         .withPagesPerDocument(1)
25 |                         .build());
26 | 
27 |         return pdfFile.read();
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/store/QdrantStoreConfigurations.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.store;
 2 | 
 3 | import java.util.concurrent.Future;
 4 | 
 5 | import io.qdrant.client.QdrantClient;
 6 | import io.qdrant.client.grpc.Collections;
 7 | import io.vavr.control.Try;
 8 | import lombok.RequiredArgsConstructor;
 9 | import org.springframework.beans.factory.annotation.Value;
10 | import org.springframework.shell.standard.ShellComponent;
11 | import org.springframework.shell.standard.ShellMethod;
12 | 
13 | @RequiredArgsConstructor
14 | @ShellComponent
15 | public class QdrantStoreConfigurations {
16 | 
17 |     private final QdrantClient qdrantClient;
18 | 
19 |     @Value("${spring.ai.vectorstore.qdrant.collection-name}")
20 |     private String collectionName;
21 | 
22 |     @ShellMethod(
23 |             key = "collection-size",
24 |             value = """
25 |                     Give it a desired collection size
26 |                     """,
27 |             group = "Prerequisite")
28 |     public void saveCollectionSize(int size) {
29 |         Try.of(() -> Collections.VectorParams.newBuilder().setSize(size)
30 |                         .setDistance(Collections.Distance.Cosine)
31 |                         .build())
32 |                 .map(vectorParams -> qdrantClient.recreateCollectionAsync(collectionName, vectorParams))
33 |                 .andThenTry(Future::get);
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | name: Build Ragscan
 2 | on: [push, pull_request]
 3 | jobs:
 4 |   build-with-graal:
 5 |     if: false # currently disabled
 6 |     name: Ragscan on ${{ matrix.os }}
 7 |     runs-on: ${{ matrix.os }}
 8 |     strategy:
 9 |       matrix:
10 |         os: [windows-latest, ubuntu-latest]
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - uses: graalvm/setup-graalvm@v1
14 |         with:
15 |           java-version: '22'
16 |           distribution: 'graalvm'
17 |           github-token: ${{ secrets.GITHUB_TOKEN }}
18 |           native-image-job-reports: 'true'
19 | 
20 |       - name: Build Ragscan
21 |         run: |
22 |           echo "GRAALVM_HOME: $GRAALVM_HOME"
23 |           echo "JAVA_HOME: $JAVA_HOME"
24 |           java --version
25 |           native-image --version
26 |       - name: Compile with maven
27 |         run: mvn -X -Pnative native:compile -DskipTests
28 | 
29 |       - name: Upload binary
30 |         uses: actions/upload-artifact@v4
31 |         with:
32 |           name: ragscan-${{ matrix.os }}
33 |           path: ragscan*
34 |   build:
35 |     name: Ragscan
36 |     runs-on: ubuntu-latest
37 |     steps:
38 |       - uses: actions/checkout@v4
39 |       - name: Set up Java
40 |         uses: actions/setup-java@v4
41 |         with:
42 |           distribution: oracle
43 |           java-version: 21
44 |       - name: Build Ragscan
45 |         run: mvn -f pom.xml clean package -DskipTests
46 |       - name: Upload build artifact
47 |         if: github.ref_name == 'master'
48 |         uses: actions/upload-artifact@v4
49 |         with:
50 |           name: artifact
51 |           path: ./target/*.jar


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/store/EmbeddingConfigurations.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.store;
 2 | 
 3 | import io.micrometer.observation.ObservationRegistry;
 4 | import org.springframework.ai.embedding.EmbeddingModel;
 5 | import org.springframework.ai.openai.OpenAiEmbeddingOptions;
 6 | import org.springframework.ai.openai.api.OpenAiApi;
 7 | import org.springframework.beans.factory.annotation.Value;
 8 | import org.springframework.context.annotation.Bean;
 9 | import org.springframework.context.annotation.Configuration;
10 | import org.springframework.retry.support.RetryTemplate;
11 | 
12 | @Configuration
13 | class EmbeddingConfigurations {
14 | 
15 |     @Value("${spring.ai.openai.api-key}")
16 |     private String apiKey;
17 | 
18 |     @Value("${spring.ai.openai.base-url}")
19 |     private String baseUrl;
20 | 
21 | 
22 |     @Value("${spring.ai.openai.embedding.options.model}")
23 |     private String embeddingModel;
24 | 
25 |     @Value("${spring.ai.openai.embedding.embeddings-path}")
26 |     private String embeddingPath;
27 | 
28 |     @Value("${spring.ai.openai.embedding.options.dimensions}")
29 |     private Integer embeddingDimension;
30 | 
31 |     @Bean
32 |     EmbeddingModel embeddingModel() {
33 |         final var openAiApi = OpenAiApi.builder()
34 |                 .apiKey(apiKey)
35 |                 .baseUrl(baseUrl)
36 |                 .embeddingsPath(embeddingPath)
37 |                 .build();
38 |         return new GeminiOpenAiEmbeddingModel(
39 |                 openAiApi,
40 |                 OpenAiEmbeddingOptions.builder()
41 |                         .model(embeddingModel)
42 |                         .dimensions(embeddingDimension)
43 |                         .build(),
44 |                 RetryTemplate.builder()
45 |                         .maxAttempts(10)
46 |                         .fixedBackoff(1000)
47 |                         .build(),
48 |                 ObservationRegistry.create()
49 |         );
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/settings.xml:
--------------------------------------------------------------------------------
 1 | <settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
 2 |           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 3 |           xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0
 4 |                               http://maven.apache.org/xsd/settings-1.0.0.xsd">
 5 |     <mirrors>
 6 |         <mirror>
 7 |             <id>spring-milestones</id>
 8 |             <url>https://repo.spring.io/milestone</url>
 9 |             <mirrorOf>*</mirrorOf>
10 |         </mirror>
11 |         <mirror>
12 |             <id>spring-snapshots</id>
13 |             <url>https://repo.spring.io/snapshot</url>
14 |             <mirrorOf>*</mirrorOf>
15 |         </mirror>
16 |         <mirror>
17 |             <id>central</id>
18 |             <mirrorOf>spring-snapshots</mirrorOf>
19 |             <url>https://repo.maven.apache.org/maven2</url>
20 |         </mirror>
21 | 
22 |     </mirrors>
23 |     <profiles>
24 |         <profile>
25 |             <id>ok</id>
26 |             <repositories>
27 |                 <repository>
28 |                     <id>spring-milestones</id>
29 |                     <name>Spring Milestones</name>
30 |                     <url>https://repo.spring.io/milestone</url>
31 |                     <snapshots>
32 |                         <enabled>false</enabled>
33 |                     </snapshots>
34 |                 </repository>
35 |                 <repository>
36 |                     <snapshots>
37 |                         <enabled>false</enabled>
38 |                     </snapshots>
39 |                     <id>central</id>
40 |                     <name>Maven Repository Switchboard</name>
41 |                     <url>https://repo1.maven.org/maven2</url>
42 |                 </repository>
43 |                 <repository>
44 |                     <id>spring-snapshots</id>
45 |                     <name>Spring Snapshots</name>
46 |                     <url>https://repo.spring.io/snapshot</url>
47 |                     <releases>
48 |                         <enabled>false</enabled>
49 |                     </releases>
50 |                 </repository>
51 |             </repositories>
52 |         </profile>
53 |     </profiles>
54 |     <activeProfiles>
55 |         <activeProfile>ok</activeProfile>
56 |     </activeProfiles>
57 | </settings>


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/gemini/GeminiCommand.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.gemini;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import lombok.RequiredArgsConstructor;
 6 | import org.springframework.ai.chat.messages.Message;
 7 | import org.springframework.ai.chat.messages.SystemMessage;
 8 | import org.springframework.ai.chat.messages.UserMessage;
 9 | import org.springframework.ai.chat.model.ChatModel;
10 | import org.springframework.ai.document.Document;
11 | import org.springframework.ai.vectorstore.SearchRequest;
12 | import org.springframework.ai.vectorstore.VectorStore;
13 | import org.springframework.shell.standard.ShellComponent;
14 | import org.springframework.shell.standard.ShellMethod;
15 | 
16 | @ShellComponent
17 | @RequiredArgsConstructor
18 | public class GeminiCommand {
19 | 
20 |     private final VectorStore vectorStore;
21 |     private final ChatModel chatModel;
22 | 
23 |     @ShellMethod(
24 |             key = "ask",
25 |             value = "Ask a question, Note: you need to load files",
26 |             group = "Chat")
27 |     private String ask(String question) {
28 |         final var resultInDB = vectorStore.similaritySearch(
29 |                 SearchRequest.builder()
30 |                         .query(question)
31 |                         .topK(5)
32 |                         .build()
33 |         );
34 |         assert resultInDB != null && !resultInDB.isEmpty()
35 |                 : "error getting context";
36 | 
37 |         final var generatedPrompt = getGenerateContentRequest(question, resultInDB)
38 |                 .toArray(new Message[0]);
39 |         return chatModel.call(generatedPrompt);
40 |     }
41 | 
42 |     private static List<Message> getGenerateContentRequest(String message, List<Document> resultInDB) {
43 |         final var systemContent = """
44 |                 Answer only from the data you got as input, otherwise say you don't know, and clean up weird formats
45 |                 like if it's json clean it up, if it's Markdown clean it up etc...
46 |                 Your name is Ragscan.
47 |                 You will always get some questions with some context. Use the context only.
48 |                 """;
49 |         final var systemMessage = new SystemMessage(systemContent);
50 |         final var messageContent = """
51 |                 Question: %s
52 |                 Context: %s
53 |                 """;
54 | 
55 |         final var userMessage = new UserMessage(String.format(messageContent, message, resultInDB.toString()));
56 | 
57 |         return List.of(
58 |                 systemMessage,
59 |                 userMessage
60 |         );
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/loader/FilesReader.java:
--------------------------------------------------------------------------------
 1 | package lunatix.ragscan.loader;
 2 | 
 3 | import java.nio.file.Files;
 4 | import java.nio.file.Path;
 5 | import java.nio.file.Paths;
 6 | import java.util.List;
 7 | 
 8 | import io.vavr.control.Try;
 9 | import lombok.RequiredArgsConstructor;
10 | import lombok.extern.slf4j.Slf4j;
11 | import org.springframework.ai.transformer.splitter.TextSplitter;
12 | import org.springframework.ai.transformer.splitter.TokenTextSplitter;
13 | import org.springframework.ai.vectorstore.VectorStore;
14 | import org.springframework.core.io.InputStreamResource;
15 | import org.springframework.shell.standard.ShellComponent;
16 | import org.springframework.shell.standard.ShellMethod;
17 | 
18 | @RequiredArgsConstructor
19 | @ShellComponent
20 | @Slf4j
21 | public class FilesReader {
22 | 
23 |     private final VectorStore vectorStore;
24 | 
25 |     @ShellMethod(
26 |             key = "load",
27 |             value = "Give it the main folder and it will load supported files inside of it",
28 |             group = "Prerequisite")
29 |     public String loadFiles(String fullPath) {
30 |         return Try.withResources(() -> Files.walk(Path.of(fullPath)))
31 |                 .of(paths -> paths
32 |                         .filter(Files::isRegularFile)
33 |                         .map(path -> {
34 |                             final var fileExtension = FileType.getFileExtension(path.getFileName());
35 |                             final var fileType = FileType.fromFileExtension(fileExtension);
36 |                             final var fileLoader = FileLoaderFactory.create(fileType);
37 |                             return Try.of(() -> Files.newInputStream(path))
38 |                                     .map(InputStreamResource::new)
39 |                                     .map(fileLoader::load)
40 |                                     .peek(documents -> {
41 |                                         final var splitter = new TokenTextSplitter();
42 |                                         final var splitDocuments = splitter.apply(documents);
43 |                                         log.info("Adding documents...");
44 |                                         vectorStore.accept(splitDocuments);
45 |                                         log.info("added {} documents", splitDocuments.size());
46 |                                     })
47 |                                     .get();
48 |                         })
49 |                         .toList()
50 |                 )
51 |                 .map(ignored -> "loaded Successfully")
52 |                 .getOrElseThrow(throwable -> new RuntimeException("Error while loading file", throwable));
53 |     }
54 | }
55 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Simple CLI Retrieval Augmented Generation Scanner
 2 | =================================================
 3 | Aim of the project: A showcase of a RAG scanner written in Java and using [Spring AI](https://docs.spring.io/spring-ai/reference/api/index.html), which scans the targeted documents and you can ask questions to the LLM regarding the given documents.
 4 | 
 5 | ## Disclaimer
 6 | This tool is intended for educational and productivity purposes only. It is designed to assist users in managing and querying their own documents. Any illegal or unethical use of this software is strictly prohibited.
 7 | 
 8 | ## Requirements
 9 | 1. [Java 21](https://www.oracle.com/java/technologies/javase/jdk21-archive-downloads.html) installed on your device
10 | 2. [Docker](https://www.docker.com/products/docker-desktop/)
11 | 3. An environment variable named `GOOGLE_API_KEY` and add your [Google Gemini API key](https://ai.google.dev/gemini-api/docs/api-key)
12 | 
13 | ## Installation
14 | 1. Navigate to the project directory
15 | 2. Open CMD/Powershell/Terminal
16 | 3. For Windows Run `./mvnw clean install`, for Linux/Mac run `./mvn clean install`
17 | 
18 | ## How to use:
19 | 1. Run `docker-compose up` in your CMD/Powershell/Terminal
20 | 2. Run the project using maven, on Windows: `./mvnw spring-boot:run`, on Linux/Mac run `./mvn spring-boot:run`.
21 | 3. When the shell opens type `collection-size 768` (for Gemini `768` is compatible).
22 | 4. Place your files in a directory, copy the full path of the directory, and run something like this `load //your//path`, wait till the files are chunked and loaded to `Qdrant vector database`.
23 | 5. Finally in the shell write `ask "your question here"` and that's it.
24 | 
25 | 
26 | ### Notes
27 | It's a simple project, needs a lot of improvements like: 
28 | 1. Improve chunking documents (Currently chunked by token size)
29 | 2. Support more file types (Currently supports txt, HTML, JSON, MD, docx, ppt, pdf, and a lot more)
30 | 3. Support other Chat models and Embeddings like GPT, Ollama, etc... (currently supports Gemini version `gemini-2.0-flash` and embedding `gemini-embedding-exp-03-07`, the reason I decided to use Gemini is that it has a good free tier)
31 | 4. Support to make it a standalone executable and a jar file, (Currently you can build it yourself and run it, it has no problem, but I will simplify it)
32 | 5. Support other vector databases ( Currently supports Qdrant, to be honest, it's good enough)
33 | 6. Support custom System Context and custom similar returned documents in DB (Default, for now, is 5.)
34 | 
35 | #### Rabbit hole
36 | Don't try to retrieve an API key from older `.git` versions, it's a rabbit hole :)
37 | 
38 | Please create an Issue, if something is wrong I will look into it, and feel free to contribute to the project.
39 | ==============
40 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  4 |     <modelVersion>4.0.0</modelVersion>
  5 |     <parent>
  6 |         <groupId>org.springframework.boot</groupId>
  7 |         <artifactId>spring-boot-starter-parent</artifactId>
  8 |         <version>3.3.1</version>
  9 |         <relativePath/>
 10 |     </parent>
 11 |     <groupId>lunatix</groupId>
 12 |     <artifactId>ragscan</artifactId>
 13 |     <version>0.0.1-SNAPSHOT</version>
 14 |     <name>ragscan</name>
 15 |     <description>ragscan</description>
 16 |     <url/>
 17 |     <licenses>
 18 |         <license/>
 19 |     </licenses>
 20 |     <developers>
 21 |         <developer/>
 22 |     </developers>
 23 |     <scm>
 24 |         <connection/>
 25 |         <developerConnection/>
 26 |         <tag/>
 27 |         <url/>
 28 |     </scm>
 29 |     <properties>
 30 |         <java.version>21</java.version>
 31 |         <spring-ai.version>1.0.0-SNAPSHOT</spring-ai.version>
 32 |         <spring-shell.version>3.3.0</spring-shell.version>
 33 |         <maven.compiler.source>21</maven.compiler.source>
 34 |         <maven.compiler.target>21</maven.compiler.target>
 35 |     </properties>
 36 |     <dependencies>
 37 |         <dependency>
 38 |             <groupId>org.springframework.ai</groupId>
 39 |             <artifactId>spring-ai-qdrant-store-spring-boot-starter</artifactId>
 40 |         </dependency>
 41 |         <dependency>
 42 |             <groupId>org.springframework.shell</groupId>
 43 |             <artifactId>spring-shell-starter</artifactId>
 44 |         </dependency>
 45 |         <dependency>
 46 |             <groupId>org.jsoup</groupId>
 47 |             <artifactId>jsoup</artifactId>
 48 |             <version>1.18.1</version>
 49 |         </dependency>
 50 |         <dependency>
 51 |             <groupId>org.projectlombok</groupId>
 52 |             <artifactId>lombok</artifactId>
 53 |             <optional>true</optional>
 54 |         </dependency>
 55 |         <dependency>
 56 |             <groupId>org.springframework.boot</groupId>
 57 |             <artifactId>spring-boot-starter-test</artifactId>
 58 |             <scope>test</scope>
 59 |         </dependency>
 60 |         <dependency>
 61 |             <groupId>org.springframework.shell</groupId>
 62 |             <artifactId>spring-shell-starter-test</artifactId>
 63 |             <scope>test</scope>
 64 |         </dependency>
 65 |         <dependency>
 66 |             <groupId>org.springframework.ai</groupId>
 67 |             <artifactId>spring-ai-tika-document-reader</artifactId>
 68 |             <version>1.0.0-SNAPSHOT</version>
 69 |         </dependency>
 70 |         <dependency>
 71 |             <groupId>org.apache.maven.plugins</groupId>
 72 |             <artifactId>maven-source-plugin</artifactId>
 73 |             <version>3.3.1</version>
 74 |         </dependency>
 75 |         <dependency>
 76 |             <groupId>org.graalvm.buildtools</groupId>
 77 |             <artifactId>native-maven-plugin</artifactId>
 78 |             <version>0.10.2</version>
 79 |         </dependency>
 80 |         <dependency>
 81 |             <groupId>org.springframework.boot</groupId>
 82 |             <artifactId>spring-boot-maven-plugin</artifactId>
 83 |             <version>3.3.1</version>
 84 |         </dependency>
 85 |         <dependency>
 86 |             <groupId>org.springframework.ai</groupId>
 87 |             <artifactId>spring-ai-pdf-document-reader</artifactId>
 88 |             <version>1.0.0-SNAPSHOT</version>
 89 |         </dependency>
 90 |         <dependency>
 91 |             <groupId>org.springframework.ai</groupId>
 92 |             <artifactId>spring-ai-openai-spring-boot-starter</artifactId>
 93 |         </dependency>
 94 |         <dependency>
 95 |             <groupId>org.springframework.ai</groupId>
 96 |             <artifactId>spring-ai-qdrant-store-spring-boot-starter</artifactId>
 97 |         </dependency>
 98 |         <dependency>
 99 |             <groupId>org.apache.httpcomponents.client5</groupId>
100 |             <artifactId>httpclient5</artifactId>
101 |             <version>5.2.1</version>
102 |         </dependency>
103 | 
104 |         <dependency>
105 |             <groupId>io.vavr</groupId>
106 |             <artifactId>vavr</artifactId>
107 |             <version>0.10.4</version>
108 |         </dependency>
109 |     </dependencies>
110 |     <dependencyManagement>
111 |         <dependencies>
112 |             <dependency>
113 |                 <groupId>org.springframework.ai</groupId>
114 |                 <artifactId>spring-ai-bom</artifactId>
115 |                 <version>${spring-ai.version}</version>
116 |                 <type>pom</type>
117 |                 <scope>import</scope>
118 |             </dependency>
119 |             <dependency>
120 |                 <groupId>org.springframework.shell</groupId>
121 |                 <artifactId>spring-shell-dependencies</artifactId>
122 |                 <version>${spring-shell.version}</version>
123 |                 <type>pom</type>
124 |                 <scope>import</scope>
125 |             </dependency>
126 |         </dependencies>
127 |     </dependencyManagement>
128 | 
129 |     <build>
130 |         <plugins>
131 |             <plugin>
132 |                 <groupId>org.graalvm.buildtools</groupId>
133 |                 <artifactId>native-maven-plugin</artifactId>
134 |                 <version>0.10.2</version>
135 |             </plugin>
136 |             <plugin>
137 |                 <groupId>org.springframework.boot</groupId>
138 |                 <artifactId>spring-boot-maven-plugin</artifactId>
139 |                 <configuration>
140 |                     <excludes>
141 |                         <exclude>
142 |                             <groupId>org.projectlombok</groupId>
143 |                             <artifactId>lombok</artifactId>
144 |                         </exclude>
145 |                     </excludes>
146 |                 </configuration>
147 |             </plugin>
148 |         </plugins>
149 |     </build>
150 |     <repositories>
151 |         <repository>
152 |             <id>spring-snapshots</id>
153 |             <name>Spring Snapshots</name>
154 |             <url>https://repo.spring.io/snapshot</url>
155 |             <releases>
156 |                 <enabled>false</enabled>
157 |             </releases>
158 |         </repository>
159 |     </repositories>
160 | </project>
161 | 


--------------------------------------------------------------------------------
/src/main/java/lunatix/ragscan/store/GeminiOpenAiEmbeddingModel.java:
--------------------------------------------------------------------------------
  1 | package lunatix.ragscan.store;
  2 | 
  3 | import java.util.List;
  4 | import java.util.Objects;
  5 | 
  6 | import io.micrometer.observation.ObservationConvention;
  7 | import io.micrometer.observation.ObservationRegistry;
  8 | import org.slf4j.Logger;
  9 | import org.slf4j.LoggerFactory;
 10 | import org.springframework.ai.chat.metadata.DefaultUsage;
 11 | import org.springframework.ai.embedding.Embedding;
 12 | import org.springframework.ai.embedding.EmbeddingOptions;
 13 | import org.springframework.ai.embedding.EmbeddingRequest;
 14 | import org.springframework.ai.embedding.EmbeddingResponse;
 15 | import org.springframework.ai.embedding.EmbeddingResponseMetadata;
 16 | import org.springframework.ai.embedding.observation.DefaultEmbeddingModelObservationConvention;
 17 | import org.springframework.ai.embedding.observation.EmbeddingModelObservationContext;
 18 | import org.springframework.ai.embedding.observation.EmbeddingModelObservationDocumentation;
 19 | import org.springframework.ai.model.ModelOptionsUtils;
 20 | import org.springframework.ai.openai.OpenAiEmbeddingModel;
 21 | import org.springframework.ai.openai.OpenAiEmbeddingOptions;
 22 | import org.springframework.ai.openai.api.OpenAiApi;
 23 | import org.springframework.ai.openai.api.common.OpenAiApiConstants;
 24 | import org.springframework.lang.Nullable;
 25 | import org.springframework.retry.support.RetryTemplate;
 26 | 
 27 | public class GeminiOpenAiEmbeddingModel extends OpenAiEmbeddingModel {
 28 | 
 29 |     private static final Logger logger = LoggerFactory.getLogger(GeminiOpenAiEmbeddingModel.class);
 30 | 
 31 | 
 32 |     private static final ObservationConvention<EmbeddingModelObservationContext> DEFAULT_OBSERVATION_CONVENTION = new DefaultEmbeddingModelObservationConvention();
 33 |     private static final ObservationConvention<EmbeddingModelObservationContext> OBSERVATION_CONVENTION = DEFAULT_OBSERVATION_CONVENTION;
 34 |     private final OpenAiEmbeddingOptions defaultOptions;
 35 | 
 36 |     private final RetryTemplate retryTemplate;
 37 |     private final ObservationRegistry observationRegistry;
 38 |     private final OpenAiApi openAiApi;
 39 | 
 40 |     public GeminiOpenAiEmbeddingModel(OpenAiApi openAiApi,
 41 |                                       OpenAiEmbeddingOptions defaultOptions,
 42 |                                       RetryTemplate retryTemplate,
 43 |                                       ObservationRegistry observationRegistry) {
 44 |         super(openAiApi);
 45 |         this.defaultOptions = defaultOptions;
 46 |         this.retryTemplate = retryTemplate;
 47 |         this.observationRegistry = observationRegistry;
 48 |         this.openAiApi = openAiApi;
 49 |     }
 50 | 
 51 |     @Override
 52 |     public EmbeddingResponse call(EmbeddingRequest request) {
 53 |         OpenAiEmbeddingOptions requestOptions = mergeOptions(request.getOptions(), this.defaultOptions);
 54 |         OpenAiApi.EmbeddingRequest<List<String>> apiRequest = createRequest(request, requestOptions);
 55 | 
 56 |         var observationContext = EmbeddingModelObservationContext.builder()
 57 |                 .embeddingRequest(request)
 58 |                 .provider(OpenAiApiConstants.PROVIDER_NAME)
 59 |                 .requestOptions(requestOptions)
 60 |                 .build();
 61 | 
 62 |         return Objects.requireNonNull(EmbeddingModelObservationDocumentation.EMBEDDING_MODEL_OPERATION
 63 |                 .observation(OBSERVATION_CONVENTION, DEFAULT_OBSERVATION_CONVENTION, () -> observationContext,
 64 |                         this.observationRegistry)
 65 |                 .observe(() -> {
 66 |                     OpenAiApi.EmbeddingList<OpenAiApi.Embedding> apiEmbeddingResponse = this.retryTemplate
 67 |                             .execute(ctx -> this.openAiApi.embeddings(apiRequest).getBody());
 68 | 
 69 |                     if (apiEmbeddingResponse == null) {
 70 |                         logger.warn("No embeddings returned for request: {}", request);
 71 |                         return new EmbeddingResponse(List.of());
 72 |                     }
 73 | 
 74 |                     var metadata = new EmbeddingResponseMetadata(apiEmbeddingResponse.model(),
 75 |                             getDefaultUsage(apiEmbeddingResponse.usage()));
 76 | 
 77 |                     List<Embedding> embeddings = apiEmbeddingResponse.data()
 78 |                             .stream()
 79 |                             .map(e -> new Embedding(e.embedding(), e.index()))
 80 |                             .toList();
 81 | 
 82 |                     EmbeddingResponse embeddingResponse = new EmbeddingResponse(embeddings, metadata);
 83 | 
 84 |                     observationContext.setResponse(embeddingResponse);
 85 | 
 86 |                     return embeddingResponse;
 87 |                 }));
 88 |     }
 89 | 
 90 |     private OpenAiEmbeddingOptions mergeOptions(@Nullable EmbeddingOptions runtimeOptions,
 91 |                                                 OpenAiEmbeddingOptions defaultOptions) {
 92 |         var runtimeOptionsForProvider = ModelOptionsUtils.copyToTarget(runtimeOptions, EmbeddingOptions.class,
 93 |                 OpenAiEmbeddingOptions.class);
 94 | 
 95 |         if (runtimeOptionsForProvider == null) {
 96 |             return defaultOptions;
 97 |         }
 98 | 
 99 |         return OpenAiEmbeddingOptions.builder()
100 |                 // Handle portable embedding options
101 |                 .model(ModelOptionsUtils.mergeOption(runtimeOptionsForProvider.getModel(), defaultOptions.getModel()))
102 |                 .dimensions(ModelOptionsUtils.mergeOption(runtimeOptionsForProvider.getDimensions(),
103 |                         defaultOptions.getDimensions()))
104 |                 // Handle OpenAI specific embedding options
105 |                 .encodingFormat(ModelOptionsUtils.mergeOption(runtimeOptionsForProvider.getEncodingFormat(),
106 |                         defaultOptions.getEncodingFormat()))
107 |                 .user(ModelOptionsUtils.mergeOption(runtimeOptionsForProvider.getUser(), defaultOptions.getUser()))
108 |                 .build();
109 |     }
110 | 
111 |     private OpenAiApi.EmbeddingRequest<List<String>> createRequest(EmbeddingRequest request,
112 |                                                                    OpenAiEmbeddingOptions requestOptions) {
113 |         return new OpenAiApi.EmbeddingRequest<>(request.getInstructions(), requestOptions.getModel(),
114 |                 requestOptions.getEncodingFormat(), requestOptions.getDimensions(), requestOptions.getUser());
115 |     }
116 | 
117 |     /*
118 |         Because Gemini doesn't provide default usage we will use a mock data, otherwise we will get an NPE exception
119 |         that's why this class is created.
120 |      */
121 |     private DefaultUsage getDefaultUsage(OpenAiApi.Usage usage) {
122 |         return new DefaultUsage(Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE, usage);
123 |     }
124 | }
125 | 


--------------------------------------------------------------------------------
/mvnw.cmd:
--------------------------------------------------------------------------------
  1 | <# : batch portion
  2 | @REM ----------------------------------------------------------------------------
  3 | @REM Licensed to the Apache Software Foundation (ASF) under one
  4 | @REM or more contributor license agreements.  See the NOTICE file
  5 | @REM distributed with this work for additional information
  6 | @REM regarding copyright ownership.  The ASF licenses this file
  7 | @REM to you under the Apache License, Version 2.0 (the
  8 | @REM "License"); you may not use this file except in compliance
  9 | @REM with the License.  You may obtain a copy of the License at
 10 | @REM
 11 | @REM    https://www.apache.org/licenses/LICENSE-2.0
 12 | @REM
 13 | @REM Unless required by applicable law or agreed to in writing,
 14 | @REM software distributed under the License is distributed on an
 15 | @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 16 | @REM KIND, either express or implied.  See the License for the
 17 | @REM specific language governing permissions and limitations
 18 | @REM under the License.
 19 | @REM ----------------------------------------------------------------------------
 20 | 
 21 | @REM ----------------------------------------------------------------------------
 22 | @REM Apache Maven Wrapper startup batch script, version 3.3.2
 23 | @REM
 24 | @REM Optional ENV vars
 25 | @REM   MVNW_REPOURL - repo url base for downloading maven distribution
 26 | @REM   MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven
 27 | @REM   MVNW_VERBOSE - true: enable verbose log; others: silence the output
 28 | @REM ----------------------------------------------------------------------------
 29 | 
 30 | @IF "%__MVNW_ARG0_NAME__%"=="" (SET __MVNW_ARG0_NAME__=%~nx0)
 31 | @SET __MVNW_CMD__=
 32 | @SET __MVNW_ERROR__=
 33 | @SET __MVNW_PSMODULEP_SAVE=%PSModulePath%
 34 | @SET PSModulePath=
 35 | @FOR /F "usebackq tokens=1* delims==" %%A IN (`powershell -noprofile "& {$scriptDir='%~dp0'; $script='%__MVNW_ARG0_NAME__%'; icm -ScriptBlock ([Scriptblock]::Create((Get-Content -Raw '%~f0'))) -NoNewScope}"`) DO @(
 36 |   IF "%%A"=="MVN_CMD" (set __MVNW_CMD__=%%B) ELSE IF "%%B"=="" (echo %%A) ELSE (echo %%A=%%B)
 37 | )
 38 | @SET PSModulePath=%__MVNW_PSMODULEP_SAVE%
 39 | @SET __MVNW_PSMODULEP_SAVE=
 40 | @SET __MVNW_ARG0_NAME__=
 41 | @SET MVNW_USERNAME=
 42 | @SET MVNW_PASSWORD=
 43 | @IF NOT "%__MVNW_CMD__%"=="" (%__MVNW_CMD__% %*)
 44 | @echo Cannot start maven from wrapper >&2 && exit /b 1
 45 | @GOTO :EOF
 46 | : end batch / begin powershell #>
 47 | 
 48 | $ErrorActionPreference = "Stop"
 49 | if ($env:MVNW_VERBOSE -eq "true") {
 50 |   $VerbosePreference = "Continue"
 51 | }
 52 | 
 53 | # calculate distributionUrl, requires .mvn/wrapper/maven-wrapper.properties
 54 | $distributionUrl = (Get-Content -Raw "$scriptDir/.mvn/wrapper/maven-wrapper.properties" | ConvertFrom-StringData).distributionUrl
 55 | if (!$distributionUrl) {
 56 |   Write-Error "cannot read distributionUrl property in $scriptDir/.mvn/wrapper/maven-wrapper.properties"
 57 | }
 58 | 
 59 | switch -wildcard -casesensitive ( $($distributionUrl -replace '^.*/','') ) {
 60 |   "maven-mvnd-*" {
 61 |     $USE_MVND = $true
 62 |     $distributionUrl = $distributionUrl -replace '-bin\.[^.]*$',"-windows-amd64.zip"
 63 |     $MVN_CMD = "mvnd.cmd"
 64 |     break
 65 |   }
 66 |   default {
 67 |     $USE_MVND = $false
 68 |     $MVN_CMD = $script -replace '^mvnw','mvn'
 69 |     break
 70 |   }
 71 | }
 72 | 
 73 | # apply MVNW_REPOURL and calculate MAVEN_HOME
 74 | # maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash>
 75 | if ($env:MVNW_REPOURL) {
 76 |   $MVNW_REPO_PATTERN = if ($USE_MVND) { "/org/apache/maven/" } else { "/maven/mvnd/" }
 77 |   $distributionUrl = "$env:MVNW_REPOURL$MVNW_REPO_PATTERN$($distributionUrl -replace '^.*'+$MVNW_REPO_PATTERN,'')"
 78 | }
 79 | $distributionUrlName = $distributionUrl -replace '^.*/',''
 80 | $distributionUrlNameMain = $distributionUrlName -replace '\.[^.]*$','' -replace '-bin$',''
 81 | $MAVEN_HOME_PARENT = "$HOME/.m2/wrapper/dists/$distributionUrlNameMain"
 82 | if ($env:MAVEN_USER_HOME) {
 83 |   $MAVEN_HOME_PARENT = "$env:MAVEN_USER_HOME/wrapper/dists/$distributionUrlNameMain"
 84 | }
 85 | $MAVEN_HOME_NAME = ([System.Security.Cryptography.MD5]::Create().ComputeHash([byte[]][char[]]$distributionUrl) | ForEach-Object {$_.ToString("x2")}) -join ''
 86 | $MAVEN_HOME = "$MAVEN_HOME_PARENT/$MAVEN_HOME_NAME"
 87 | 
 88 | if (Test-Path -Path "$MAVEN_HOME" -PathType Container) {
 89 |   Write-Verbose "found existing MAVEN_HOME at $MAVEN_HOME"
 90 |   Write-Output "MVN_CMD=$MAVEN_HOME/bin/$MVN_CMD"
 91 |   exit $?
 92 | }
 93 | 
 94 | if (! $distributionUrlNameMain -or ($distributionUrlName -eq $distributionUrlNameMain)) {
 95 |   Write-Error "distributionUrl is not valid, must end with *-bin.zip, but found $distributionUrl"
 96 | }
 97 | 
 98 | # prepare tmp dir
 99 | $TMP_DOWNLOAD_DIR_HOLDER = New-TemporaryFile
100 | $TMP_DOWNLOAD_DIR = New-Item -Itemtype Directory -Path "$TMP_DOWNLOAD_DIR_HOLDER.dir"
101 | $TMP_DOWNLOAD_DIR_HOLDER.Delete() | Out-Null
102 | trap {
103 |   if ($TMP_DOWNLOAD_DIR.Exists) {
104 |     try { Remove-Item $TMP_DOWNLOAD_DIR -Recurse -Force | Out-Null }
105 |     catch { Write-Warning "Cannot remove $TMP_DOWNLOAD_DIR" }
106 |   }
107 | }
108 | 
109 | New-Item -Itemtype Directory -Path "$MAVEN_HOME_PARENT" -Force | Out-Null
110 | 
111 | # Download and Install Apache Maven
112 | Write-Verbose "Couldn't find MAVEN_HOME, downloading and installing it ..."
113 | Write-Verbose "Downloading from: $distributionUrl"
114 | Write-Verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName"
115 | 
116 | $webclient = New-Object System.Net.WebClient
117 | if ($env:MVNW_USERNAME -and $env:MVNW_PASSWORD) {
118 |   $webclient.Credentials = New-Object System.Net.NetworkCredential($env:MVNW_USERNAME, $env:MVNW_PASSWORD)
119 | }
120 | [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
121 | $webclient.DownloadFile($distributionUrl, "$TMP_DOWNLOAD_DIR/$distributionUrlName") | Out-Null
122 | 
123 | # If specified, validate the SHA-256 sum of the Maven distribution zip file
124 | $distributionSha256Sum = (Get-Content -Raw "$scriptDir/.mvn/wrapper/maven-wrapper.properties" | ConvertFrom-StringData).distributionSha256Sum
125 | if ($distributionSha256Sum) {
126 |   if ($USE_MVND) {
127 |     Write-Error "Checksum validation is not supported for maven-mvnd. `nPlease disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties."
128 |   }
129 |   Import-Module $PSHOME\Modules\Microsoft.PowerShell.Utility -Function Get-FileHash
130 |   if ((Get-FileHash "$TMP_DOWNLOAD_DIR/$distributionUrlName" -Algorithm SHA256).Hash.ToLower() -ne $distributionSha256Sum) {
131 |     Write-Error "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised. If you updated your Maven version, you need to update the specified distributionSha256Sum property."
132 |   }
133 | }
134 | 
135 | # unzip and move
136 | Expand-Archive "$TMP_DOWNLOAD_DIR/$distributionUrlName" -DestinationPath "$TMP_DOWNLOAD_DIR" | Out-Null
137 | Rename-Item -Path "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" -NewName $MAVEN_HOME_NAME | Out-Null
138 | try {
139 |   Move-Item -Path "$TMP_DOWNLOAD_DIR/$MAVEN_HOME_NAME" -Destination $MAVEN_HOME_PARENT | Out-Null
140 | } catch {
141 |   if (! (Test-Path -Path "$MAVEN_HOME" -PathType Container)) {
142 |     Write-Error "fail to move MAVEN_HOME"
143 |   }
144 | } finally {
145 |   try { Remove-Item $TMP_DOWNLOAD_DIR -Recurse -Force | Out-Null }
146 |   catch { Write-Warning "Cannot remove $TMP_DOWNLOAD_DIR" }
147 | }
148 | 
149 | Write-Output "MVN_CMD=$MAVEN_HOME/bin/$MVN_CMD"
150 | 


--------------------------------------------------------------------------------
/mvnw:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | # ----------------------------------------------------------------------------
  3 | # Licensed to the Apache Software Foundation (ASF) under one
  4 | # or more contributor license agreements.  See the NOTICE file
  5 | # distributed with this work for additional information
  6 | # regarding copyright ownership.  The ASF licenses this file
  7 | # to you under the Apache License, Version 2.0 (the
  8 | # "License"); you may not use this file except in compliance
  9 | # with the License.  You may obtain a copy of the License at
 10 | #
 11 | #    https://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing,
 14 | # software distributed under the License is distributed on an
 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 16 | # KIND, either express or implied.  See the License for the
 17 | # specific language governing permissions and limitations
 18 | # under the License.
 19 | # ----------------------------------------------------------------------------
 20 | 
 21 | # ----------------------------------------------------------------------------
 22 | # Apache Maven Wrapper startup batch script, version 3.3.2
 23 | #
 24 | # Optional ENV vars
 25 | # -----------------
 26 | #   JAVA_HOME - location of a JDK home dir, required when download maven via java source
 27 | #   MVNW_REPOURL - repo url base for downloading maven distribution
 28 | #   MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven
 29 | #   MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output
 30 | # ----------------------------------------------------------------------------
 31 | 
 32 | set -euf
 33 | [ "${MVNW_VERBOSE-}" != debug ] || set -x
 34 | 
 35 | # OS specific support.
 36 | native_path() { printf %s\\n "$1"; }
 37 | case "$(uname)" in
 38 | CYGWIN* | MINGW*)
 39 |   [ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")"
 40 |   native_path() { cygpath --path --windows "$1"; }
 41 |   ;;
 42 | esac
 43 | 
 44 | # set JAVACMD and JAVACCMD
 45 | set_java_home() {
 46 |   # For Cygwin and MinGW, ensure paths are in Unix format before anything is touched
 47 |   if [ -n "${JAVA_HOME-}" ]; then
 48 |     if [ -x "$JAVA_HOME/jre/sh/java" ]; then
 49 |       # IBM's JDK on AIX uses strange locations for the executables
 50 |       JAVACMD="$JAVA_HOME/jre/sh/java"
 51 |       JAVACCMD="$JAVA_HOME/jre/sh/javac"
 52 |     else
 53 |       JAVACMD="$JAVA_HOME/bin/java"
 54 |       JAVACCMD="$JAVA_HOME/bin/javac"
 55 | 
 56 |       if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then
 57 |         echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2
 58 |         echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2
 59 |         return 1
 60 |       fi
 61 |     fi
 62 |   else
 63 |     JAVACMD="$(
 64 |       'set' +e
 65 |       'unset' -f command 2>/dev/null
 66 |       'command' -v java
 67 |     )" || :
 68 |     JAVACCMD="$(
 69 |       'set' +e
 70 |       'unset' -f command 2>/dev/null
 71 |       'command' -v javac
 72 |     )" || :
 73 | 
 74 |     if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then
 75 |       echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2
 76 |       return 1
 77 |     fi
 78 |   fi
 79 | }
 80 | 
 81 | # hash string like Java String::hashCode
 82 | hash_string() {
 83 |   str="${1:-}" h=0
 84 |   while [ -n "$str" ]; do
 85 |     char="${str%"${str#?}"}"
 86 |     h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296))
 87 |     str="${str#?}"
 88 |   done
 89 |   printf %x\\n $h
 90 | }
 91 | 
 92 | verbose() { :; }
 93 | [ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; }
 94 | 
 95 | die() {
 96 |   printf %s\\n "$1" >&2
 97 |   exit 1
 98 | }
 99 | 
100 | trim() {
101 |   # MWRAPPER-139:
102 |   #   Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds.
103 |   #   Needed for removing poorly interpreted newline sequences when running in more
104 |   #   exotic environments such as mingw bash on Windows.
105 |   printf "%s" "${1}" | tr -d '[:space:]'
106 | }
107 | 
108 | # parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties
109 | while IFS="=" read -r key value; do
110 |   case "${key-}" in
111 |   distributionUrl) distributionUrl=$(trim "${value-}") ;;
112 |   distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;;
113 |   esac
114 | done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties"
115 | [ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties"
116 | 
117 | case "${distributionUrl##*/}" in
118 | maven-mvnd-*bin.*)
119 |   MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/
120 |   case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in
121 |   *AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;;
122 |   :Darwin*x86_64) distributionPlatform=darwin-amd64 ;;
123 |   :Darwin*arm64) distributionPlatform=darwin-aarch64 ;;
124 |   :Linux*x86_64*) distributionPlatform=linux-amd64 ;;
125 |   *)
126 |     echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2
127 |     distributionPlatform=linux-amd64
128 |     ;;
129 |   esac
130 |   distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip"
131 |   ;;
132 | maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;;
133 | *) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;;
134 | esac
135 | 
136 | # apply MVNW_REPOURL and calculate MAVEN_HOME
137 | # maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash>
138 | [ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}"
139 | distributionUrlName="${distributionUrl##*/}"
140 | distributionUrlNameMain="${distributionUrlName%.*}"
141 | distributionUrlNameMain="${distributionUrlNameMain%-bin}"
142 | MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}"
143 | MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")"
144 | 
145 | exec_maven() {
146 |   unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || :
147 |   exec "$MAVEN_HOME/bin/$MVN_CMD" "$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD"
148 | }
149 | 
150 | if [ -d "$MAVEN_HOME" ]; then
151 |   verbose "found existing MAVEN_HOME at $MAVEN_HOME"
152 |   exec_maven "$@"
153 | fi
154 | 
155 | case "${distributionUrl-}" in
156 | *?-bin.zip | *?maven-mvnd-?*-?*.zip) ;;
157 | *) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;;
158 | esac
159 | 
160 | # prepare tmp dir
161 | if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then
162 |   clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; }
163 |   trap clean HUP INT TERM EXIT
164 | else
165 |   die "cannot create temp dir"
166 | fi
167 | 
168 | mkdir -p -- "${MAVEN_HOME%/*}"
169 | 
170 | # Download and Install Apache Maven
171 | verbose "Couldn't find MAVEN_HOME, downloading and installing it ..."
172 | verbose "Downloading from: $distributionUrl"
173 | verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName"
174 | 
175 | # select .zip or .tar.gz
176 | if ! command -v unzip >/dev/null; then
177 |   distributionUrl="${distributionUrl%.zip}.tar.gz"
178 |   distributionUrlName="${distributionUrl##*/}"
179 | fi
180 | 
181 | # verbose opt
182 | __MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR=''
183 | [ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v
184 | 
185 | # normalize http auth
186 | case "${MVNW_PASSWORD:+has-password}" in
187 | '') MVNW_USERNAME='' MVNW_PASSWORD='' ;;
188 | has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;;
189 | esac
190 | 
191 | if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then
192 |   verbose "Found wget ... using wget"
193 |   wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl"
194 | elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then
195 |   verbose "Found curl ... using curl"
196 |   curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl"
197 | elif set_java_home; then
198 |   verbose "Falling back to use Java to download"
199 |   javaSource="$TMP_DOWNLOAD_DIR/Downloader.java"
200 |   targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName"
201 |   cat >"$javaSource" <<-END
202 | 	public class Downloader extends java.net.Authenticator
203 | 	{
204 | 	  protected java.net.PasswordAuthentication getPasswordAuthentication()
205 | 	  {
206 | 	    return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() );
207 | 	  }
208 | 	  public static void main( String[] args ) throws Exception
209 | 	  {
210 | 	    setDefault( new Downloader() );
211 | 	    java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() );
212 | 	  }
213 | 	}
214 | 	END
215 |   # For Cygwin/MinGW, switch paths to Windows format before running javac and java
216 |   verbose " - Compiling Downloader.java ..."
217 |   "$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java"
218 |   verbose " - Running Downloader.java ..."
219 |   "$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")"
220 | fi
221 | 
222 | # If specified, validate the SHA-256 sum of the Maven distribution zip file
223 | if [ -n "${distributionSha256Sum-}" ]; then
224 |   distributionSha256Result=false
225 |   if [ "$MVN_CMD" = mvnd.sh ]; then
226 |     echo "Checksum validation is not supported for maven-mvnd." >&2
227 |     echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
228 |     exit 1
229 |   elif command -v sha256sum >/dev/null; then
230 |     if echo "$distributionSha256Sum  $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then
231 |       distributionSha256Result=true
232 |     fi
233 |   elif command -v shasum >/dev/null; then
234 |     if echo "$distributionSha256Sum  $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then
235 |       distributionSha256Result=true
236 |     fi
237 |   else
238 |     echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2
239 |     echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2
240 |     exit 1
241 |   fi
242 |   if [ $distributionSha256Result = false ]; then
243 |     echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2
244 |     echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." >&2
245 |     exit 1
246 |   fi
247 | fi
248 | 
249 | # unzip and move
250 | if command -v unzip >/dev/null; then
251 |   unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip"
252 | else
253 |   tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar"
254 | fi
255 | printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url"
256 | mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME"
257 | 
258 | clean || :
259 | exec_maven "$@"
260 | 


--------------------------------------------------------------------------------
/src/main/resources/docs/file.json:
--------------------------------------------------------------------------------
  1 | {
  2 | 	"info": {
  3 | 		"_postman_id": "732f0f05-b9c6-4763-b636-e4dc83c5cd11",
  4 | 		"name": "car-dealer",
  5 | 		"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json",
  6 | 		"_exporter_id": "13628794",
  7 | 		"_collection_link": "https://grey-water-607719.postman.co/workspace/My-Workspace~7c8e265d-3378-4686-836d-e08a8569baca/collection/13628794-732f0f05-b9c6-4763-b636-e4dc83c5cd11?action=share&source=collection_link&creator=13628794"
  8 | 	},
  9 | 	"item": [
 10 | 		{
 11 | 			"name": "auth",
 12 | 			"item": [
 13 | 				{
 14 | 					"name": "Register user",
 15 | 					"request": {
 16 | 						"method": "POST",
 17 | 						"header": [],
 18 | 						"body": {
 19 | 							"mode": "raw",
 20 | 							"raw": "{   \r\n    \"fullName\": \"xullaaa\",\r\n    \"phoneNumber\": \"+9647711500575\",\r\n    \"password\":\"HeLlo@12345@dD\",\r\n    \"confirmPassword\": \"HeLlo@12345@dD\"\r\n}",
 21 | 							"options": {
 22 | 								"raw": {
 23 | 									"language": "json"
 24 | 								}
 25 | 							}
 26 | 						},
 27 | 						"url": {
 28 | 							"raw": "http://localhost:3000/api/v1/auth/register",
 29 | 							"protocol": "http",
 30 | 							"host": [
 31 | 								"localhost"
 32 | 							],
 33 | 							"port": "3000",
 34 | 							"path": [
 35 | 								"api",
 36 | 								"v1",
 37 | 								"auth",
 38 | 								"register"
 39 | 							]
 40 | 						}
 41 | 					},
 42 | 					"response": []
 43 | 				},
 44 | 				{
 45 | 					"name": "login user",
 46 | 					"request": {
 47 | 						"method": "POST",
 48 | 						"header": [],
 49 | 						"body": {
 50 | 							"mode": "raw",
 51 | 							"raw": "{   \r\n    \"phoneNumber\": \"+9647711500575\",\r\n    \"password\":\"HeLlo@12345@dD\"\r\n}",
 52 | 							"options": {
 53 | 								"raw": {
 54 | 									"language": "json"
 55 | 								}
 56 | 							}
 57 | 						},
 58 | 						"url": {
 59 | 							"raw": "http://localhost:3000/api/v1/auth/login",
 60 | 							"protocol": "http",
 61 | 							"host": [
 62 | 								"localhost"
 63 | 							],
 64 | 							"port": "3000",
 65 | 							"path": [
 66 | 								"api",
 67 | 								"v1",
 68 | 								"auth",
 69 | 								"login"
 70 | 							]
 71 | 						}
 72 | 					},
 73 | 					"response": []
 74 | 				}
 75 | 			]
 76 | 		},
 77 | 		{
 78 | 			"name": "showroom",
 79 | 			"item": [
 80 | 				{
 81 | 					"name": "Create showroom",
 82 | 					"request": {
 83 | 						"method": "POST",
 84 | 						"header": [],
 85 | 						"body": {
 86 | 							"mode": "raw",
 87 | 							"raw": "{\r\n      \"name\": \"2dsad2\",\r\n      \"phoneNumber\": \"+9647711500574\",\r\n      \"email\": \"ao@gmail.com\",\r\n      \"website\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"facebookLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"instagramLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"tiktokLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"snapchatLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"cityId\": \"kjh6jdt9e8clqo2bflfujk1n\",\r\n      \"street\": \"street\",\r\n      \"profilePicture\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"type\": \"SHOWROOM\",\r\n      \"latitude\": 321,\r\n      \"longitude\": 2321\r\n}",
 88 | 							"options": {
 89 | 								"raw": {
 90 | 									"language": "json"
 91 | 								}
 92 | 							}
 93 | 						},
 94 | 						"url": {
 95 | 							"raw": "http://localhost:3000/api/v1/showrooms",
 96 | 							"protocol": "http",
 97 | 							"host": [
 98 | 								"localhost"
 99 | 							],
100 | 							"port": "3000",
101 | 							"path": [
102 | 								"api",
103 | 								"v1",
104 | 								"showrooms"
105 | 							]
106 | 						}
107 | 					},
108 | 					"response": []
109 | 				},
110 | 				{
111 | 					"name": "Update showroom",
112 | 					"request": {
113 | 						"method": "PUT",
114 | 						"header": [],
115 | 						"body": {
116 | 							"mode": "raw",
117 | 							"raw": "{\r\n      \"name\": \"xula22\",\r\n      \"phoneNumber\": \"+9647711500576\",\r\n      \"email\": \"ao@gmail.com\",\r\n      \"website\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"facebookLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"instagramLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"tiktokLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"snapchatLink\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"governorate\": \"governorate\",\r\n      \"cityId\": \"kjh6jdt9e8clqo2bflfujk1n\",\r\n      \"street\": \"street2\",\r\n      \"profilePicture\": \"http://localhost:3000/api/v1/showrooms\",\r\n      \"type\": \"DEALERSHIP\",\r\n      \"latitude\": 321,\r\n      \"longitude\": 2321\r\n}",
118 | 							"options": {
119 | 								"raw": {
120 | 									"language": "json"
121 | 								}
122 | 							}
123 | 						},
124 | 						"url": {
125 | 							"raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25",
126 | 							"protocol": "http",
127 | 							"host": [
128 | 								"localhost"
129 | 							],
130 | 							"port": "3000",
131 | 							"path": [
132 | 								"api",
133 | 								"v1",
134 | 								"showrooms",
135 | 								"b16xfrg4xy1o3bpkoocftd25"
136 | 							]
137 | 						}
138 | 					},
139 | 					"response": []
140 | 				},
141 | 				{
142 | 					"name": "Delete showroom",
143 | 					"request": {
144 | 						"auth": {
145 | 							"type": "bearer",
146 | 							"bearer": [
147 | 								{
148 | 									"key": "token",
149 | 									"value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6ImdycnpmaGVtZTJsZHIxbThua3diNmp6aCIsImlhdCI6MTcxMDk4MTcxNiwiZXhwIjoxNzEwOTg1MzE2fQ.xWp52J3VcL1GN53J-vWutFBd5W1hvZo4hi1fO9MYGJ8",
150 | 									"type": "string"
151 | 								}
152 | 							]
153 | 						},
154 | 						"method": "DELETE",
155 | 						"header": [],
156 | 						"url": {
157 | 							"raw": "http://localhost:3000/api/v1/showrooms/pni90mnecrgfn0r5jyp5x1dv",
158 | 							"protocol": "http",
159 | 							"host": [
160 | 								"localhost"
161 | 							],
162 | 							"port": "3000",
163 | 							"path": [
164 | 								"api",
165 | 								"v1",
166 | 								"showrooms",
167 | 								"pni90mnecrgfn0r5jyp5x1dv"
168 | 							]
169 | 						}
170 | 					},
171 | 					"response": []
172 | 				},
173 | 				{
174 | 					"name": "My showrooms",
175 | 					"request": {
176 | 						"auth": {
177 | 							"type": "bearer",
178 | 							"bearer": [
179 | 								{
180 | 									"key": "token",
181 | 									"value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6ImM1ZWZnbzJxMmJjNXBtYmNhaXJ3bmlvdiIsInBob25lTnVtYmVyIjoiKzk2NDc3MTE1MDA1NzQiLCJmdWxsTmFtZSI6Inh1bGxhYWEiLCJsZXZlbCI6IlVTRVIiLCJpYXQiOjE3MTI2MDg3ODEsImV4cCI6MTcxMjYxMjM4MX0.P2GzJSx_qzYtNmrdzXCy_lM4dtbM7o0Z2H6psArsb6g",
182 | 									"type": "string"
183 | 								}
184 | 							]
185 | 						},
186 | 						"method": "GET",
187 | 						"header": [],
188 | 						"url": {
189 | 							"raw": "http://localhost:3000/api/v1/showrooms/me/all",
190 | 							"protocol": "http",
191 | 							"host": [
192 | 								"localhost"
193 | 							],
194 | 							"port": "3000",
195 | 							"path": [
196 | 								"api",
197 | 								"v1",
198 | 								"showrooms",
199 | 								"me",
200 | 								"all"
201 | 							]
202 | 						}
203 | 					},
204 | 					"response": []
205 | 				},
206 | 				{
207 | 					"name": "Get All showrooms",
208 | 					"request": {
209 | 						"method": "GET",
210 | 						"header": [],
211 | 						"url": {
212 | 							"raw": "http://localhost:3000/api/v1/showrooms",
213 | 							"protocol": "http",
214 | 							"host": [
215 | 								"localhost"
216 | 							],
217 | 							"port": "3000",
218 | 							"path": [
219 | 								"api",
220 | 								"v1",
221 | 								"showrooms"
222 | 							]
223 | 						}
224 | 					},
225 | 					"response": []
226 | 				},
227 | 				{
228 | 					"name": "Get a single showroom",
229 | 					"request": {
230 | 						"method": "GET",
231 | 						"header": [],
232 | 						"url": {
233 | 							"raw": "http://localhost:3000/api/v1/showrooms/ic8nq13o5ns9h41cdbsnoh3z",
234 | 							"protocol": "http",
235 | 							"host": [
236 | 								"localhost"
237 | 							],
238 | 							"port": "3000",
239 | 							"path": [
240 | 								"api",
241 | 								"v1",
242 | 								"showrooms",
243 | 								"ic8nq13o5ns9h41cdbsnoh3z"
244 | 							]
245 | 						}
246 | 					},
247 | 					"response": []
248 | 				}
249 | 			]
250 | 		},
251 | 		{
252 | 			"name": "admin",
253 | 			"item": [
254 | 				{
255 | 					"name": "metadata",
256 | 					"item": [
257 | 						{
258 | 							"name": "create city",
259 | 							"request": {
260 | 								"auth": {
261 | 									"type": "bearer",
262 | 									"bearer": [
263 | 										{
264 | 											"key": "token",
265 | 											"value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6ImM1ZWZnbzJxMmJjNXBtYmNhaXJ3bmlvdiIsInBob25lTnVtYmVyIjoiKzk2NDc3MTE1MDA1NzQiLCJmdWxsTmFtZSI6Inh1bGxhYWEiLCJsZXZlbCI6IlVTRVIiLCJpYXQiOjE3MTE4MzA2ODcsImV4cCI6MTcxMTgzNDI4N30.n0vux5Hr_cTwbl9Ps1P2xea9CHVm0oc5WQK35KWA_KA",
266 | 											"type": "string"
267 | 										}
268 | 									]
269 | 								},
270 | 								"method": "POST",
271 | 								"header": [],
272 | 								"body": {
273 | 									"mode": "raw",
274 | 									"raw": "{\r\n    \"name\": \"kalar\",\r\n    \"governorateId\": \"ilo29oos241vw755fi6oomnq\"\r\n}",
275 | 									"options": {
276 | 										"raw": {
277 | 											"language": "json"
278 | 										}
279 | 									}
280 | 								},
281 | 								"url": {
282 | 									"raw": "http://localhost:3000/api/v1/admin/metadata/city",
283 | 									"protocol": "http",
284 | 									"host": [
285 | 										"localhost"
286 | 									],
287 | 									"port": "3000",
288 | 									"path": [
289 | 										"api",
290 | 										"v1",
291 | 										"admin",
292 | 										"metadata",
293 | 										"city"
294 | 									]
295 | 								}
296 | 							},
297 | 							"response": []
298 | 						},
299 | 						{
300 | 							"name": "Create governorate",
301 | 							"request": {
302 | 								"method": "POST",
303 | 								"header": [],
304 | 								"body": {
305 | 									"mode": "raw",
306 | 									"raw": "{\r\n    \"name\": \"sulaimaniyah\"\r\n}",
307 | 									"options": {
308 | 										"raw": {
309 | 											"language": "json"
310 | 										}
311 | 									}
312 | 								},
313 | 								"url": {
314 | 									"raw": "http://localhost:3000/api/v1/admin/metadata/governorate",
315 | 									"protocol": "http",
316 | 									"host": [
317 | 										"localhost"
318 | 									],
319 | 									"port": "3000",
320 | 									"path": [
321 | 										"api",
322 | 										"v1",
323 | 										"admin",
324 | 										"metadata",
325 | 										"governorate"
326 | 									]
327 | 								}
328 | 							},
329 | 							"response": []
330 | 						}
331 | 					]
332 | 				}
333 | 			]
334 | 		},
335 | 		{
336 | 			"name": "public",
337 | 			"item": [
338 | 				{
339 | 					"name": "metadata",
340 | 					"item": [
341 | 						{
342 | 							"name": "Get all cities",
343 | 							"request": {
344 | 								"method": "GET",
345 | 								"header": [],
346 | 								"url": {
347 | 									"raw": "http://localhost:3000/api/v1/public/metadata/cities",
348 | 									"protocol": "http",
349 | 									"host": [
350 | 										"localhost"
351 | 									],
352 | 									"port": "3000",
353 | 									"path": [
354 | 										"api",
355 | 										"v1",
356 | 										"public",
357 | 										"metadata",
358 | 										"cities"
359 | 									]
360 | 								}
361 | 							},
362 | 							"response": []
363 | 						},
364 | 						{
365 | 							"name": "Get All governorates",
366 | 							"request": {
367 | 								"method": "GET",
368 | 								"header": [],
369 | 								"url": {
370 | 									"raw": "http://localhost:3000/api/v1/public/metadata/governorates",
371 | 									"protocol": "http",
372 | 									"host": [
373 | 										"localhost"
374 | 									],
375 | 									"port": "3000",
376 | 									"path": [
377 | 										"api",
378 | 										"v1",
379 | 										"public",
380 | 										"metadata",
381 | 										"governorates"
382 | 									]
383 | 								}
384 | 							},
385 | 							"response": []
386 | 						}
387 | 					]
388 | 				}
389 | 			]
390 | 		},
391 | 		{
392 | 			"name": "car",
393 | 			"item": [
394 | 				{
395 | 					"name": "save car",
396 | 					"request": {
397 | 						"method": "POST",
398 | 						"header": [],
399 | 						"body": {
400 | 							"mode": "raw",
401 | 							"raw": "{\r\n    \"brand\" : \"Toyota\",\r\n    \"model\" : \"Camry\",\r\n    \"year\" : 2023,\r\n    \"trim\" : \"SE+\",\r\n    \"transmission\" : \"gasoline\",\r\n    \"color\": \"red\",\r\n    \"fuel\" : \"gasoline\",\r\n    \"plateType\" : \"vehicle\",\r\n    \"plateCityId\": \"kjh6jdt9e8clqo2bflfujk1n\",\r\n    \"importCountry\": \"usa\",\r\n    \"price\" : 250000,\r\n    \"currency\" : \"USD\",\r\n    \"priceHidden\" : false,\r\n    \"phoneNumber\" : \"+9647711500573\",\r\n    \"inspectionDocumentLink\": \"https://google.com\",\r\n    \"damages\":\"none\",\r\n    \"damageType\": \"Clean\"\r\n}",
402 | 							"options": {
403 | 								"raw": {
404 | 									"language": "json"
405 | 								}
406 | 							}
407 | 						},
408 | 						"url": {
409 | 							"raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25/cars",
410 | 							"protocol": "http",
411 | 							"host": [
412 | 								"localhost"
413 | 							],
414 | 							"port": "3000",
415 | 							"path": [
416 | 								"api",
417 | 								"v1",
418 | 								"showrooms",
419 | 								"b16xfrg4xy1o3bpkoocftd25",
420 | 								"cars"
421 | 							]
422 | 						}
423 | 					},
424 | 					"response": []
425 | 				},
426 | 				{
427 | 					"name": "all my cars",
428 | 					"request": {
429 | 						"method": "GET",
430 | 						"header": [],
431 | 						"url": {
432 | 							"raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25/cars/me/all",
433 | 							"protocol": "http",
434 | 							"host": [
435 | 								"localhost"
436 | 							],
437 | 							"port": "3000",
438 | 							"path": [
439 | 								"api",
440 | 								"v1",
441 | 								"showrooms",
442 | 								"b16xfrg4xy1o3bpkoocftd25",
443 | 								"cars",
444 | 								"me",
445 | 								"all"
446 | 							]
447 | 						}
448 | 					},
449 | 					"response": []
450 | 				},
451 | 				{
452 | 					"name": "get my car by id",
453 | 					"request": {
454 | 						"method": "GET",
455 | 						"header": [],
456 | 						"url": {
457 | 							"raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25/cars/qz1t6580ogo8f1kc0jo4rram",
458 | 							"protocol": "http",
459 | 							"host": [
460 | 								"localhost"
461 | 							],
462 | 							"port": "3000",
463 | 							"path": [
464 | 								"api",
465 | 								"v1",
466 | 								"showrooms",
467 | 								"b16xfrg4xy1o3bpkoocftd25",
468 | 								"cars",
469 | 								"qz1t6580ogo8f1kc0jo4rram"
470 | 							]
471 | 						}
472 | 					},
473 | 					"response": []
474 | 				},
475 | 				{
476 | 					"name": "delete a car from a show room",
477 | 					"request": {
478 | 						"method": "DELETE",
479 | 						"header": [],
480 | 						"url": {
481 | 							"raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25/cars/qz1t6580ogo8f1kc0jo4rram",
482 | 							"protocol": "http",
483 | 							"host": [
484 | 								"localhost"
485 | 							],
486 | 							"port": "3000",
487 | 							"path": [
488 | 								"api",
489 | 								"v1",
490 | 								"showrooms",
491 | 								"b16xfrg4xy1o3bpkoocftd25",
492 | 								"cars",
493 | 								"qz1t6580ogo8f1kc0jo4rram"
494 | 							]
495 | 						}
496 | 					},
497 | 					"response": []
498 | 				},
499 | 				{
500 | 					"name": "upload car img",
501 | 					"request": {
502 | 						"method": "POST",
503 | 						"header": [],
504 | 						"body": {
505 | 							"mode": "formdata",
506 | 							"formdata": [
507 | 								{
508 | 									"key": "image",
509 | 									"type": "file",
510 | 									"src": "/C:/Users/aland/Downloads/Stamp.JPG"
511 | 								}
512 | 							]
513 | 						},
514 | 						"url": {
515 | 							"raw": "http://localhost:3000/api/v1/showrooms/b16xfrg4xy1o3bpkoocftd25/cars/hg91wrvv52y76vnn0h30461p/images",
516 | 							"protocol": "http",
517 | 							"host": [
518 | 								"localhost"
519 | 							],
520 | 							"port": "3000",
521 | 							"path": [
522 | 								"api",
523 | 								"v1",
524 | 								"showrooms",
525 | 								"b16xfrg4xy1o3bpkoocftd25",
526 | 								"cars",
527 | 								"hg91wrvv52y76vnn0h30461p",
528 | 								"images"
529 | 							]
530 | 						}
531 | 					},
532 | 					"response": []
533 | 				}
534 | 			]
535 | 		}
536 | 	],
537 | 	"auth": {
538 | 		"type": "bearer",
539 | 		"bearer": [
540 | 			{
541 | 				"key": "token",
542 | 				"value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6ImRwZWk1ZWkzN2d1aG52ZzQ2bjBiN3lwdSIsInBob25lTnVtYmVyIjoiKzk2NDc3MTE1MDA1NzUiLCJmdWxsTmFtZSI6Inh1bGxhYWEiLCJsZXZlbCI6IlVTRVIiLCJpYXQiOjE3MTQ5Mjg4MzUsImV4cCI6MTcxNDkzMjQzNX0.Pj9uk_wJsPE21R_tg0C16XZjFnJgsI2lbAaX6yPD3BQ",
543 | 				"type": "string"
544 | 			}
545 | 		]
546 | 	},
547 | 	"event": [
548 | 		{
549 | 			"listen": "prerequest",
550 | 			"script": {
551 | 				"type": "text/javascript",
552 | 				"packages": {},
553 | 				"exec": [
554 | 					""
555 | 				]
556 | 			}
557 | 		},
558 | 		{
559 | 			"listen": "test",
560 | 			"script": {
561 | 				"type": "text/javascript",
562 | 				"packages": {},
563 | 				"exec": [
564 | 					""
565 | 				]
566 | 			}
567 | 		}
568 | 	]
569 | }


--------------------------------------------------------------------------------
/src/main/resources/docs/initial-design.md:
--------------------------------------------------------------------------------
  1 | # ragscan initial design doc
  2 | 
  3 | ## Objective
  4 | 
  5 | The overall goal of this project is to test the application of Retreival Augmented Generation (RAG) to the reconnaissance phase of pentesting. RAG has shown
  6 | to be effective in giving Large Language Models (LLM's) dynamic information in which it is not trained on for use to answer the users prompts and instructions.
  7 | 
  8 | ## Overview
  9 | This document will show the high level design of the initial approach taken for this tool.
 10 | 
 11 | ## Processing of File and Document Input
 12 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/f56a6f25-44bc-4d77-88f5-318fdfd0865c)
 13 | 
 14 | ### Scrape Web Content
 15 | As of now, there is not any plans to add any tooling that will do the actual scraping of web content as it would put less focus on the core functionality
 16 | of this project and there are other tools that can do this much better and have had more project maturity. Many of these tools can be found in https://github.com/BruceDone/awesome-crawler.
 17 | 
 18 | ### Input/Scraped Web Content 
 19 | The expected input to the tool are any pages, code, or information that was gathered through other tools and are stored in a directory in their original files and file 
 20 | structure. This ensures that there is tracking as to where information that the LLM processes is from and is easily identifiable for the user if they wish to manually inspect
 21 | the identified file themselves after tool usage.
 22 | 
 23 | ### Initial Loading of Content
 24 | On execution of the tool, it will load all of the files in the specified directory, open them, convert them into standardized document objects that a vector
 25 | database will be able to identify and use.
 26 | 
 27 | ### Document to Embeddings Conversion
 28 | After each of the files are loaded as documents, they can be converted into [embeddings](https://www.cloudflare.com/learning/ai/what-are-embeddings/) which is a unique
 29 | representation of that specific object that can be used for more complex processing. This conversion is done through the use of [text embedding models](https://medium.com/@minh.hoque/text-embedding-models-an-insightful-dive-759ea53576f5) thats sole purpose is to create the vectorized representation of chunks of text.
 30 | 
 31 | ### Storing the Embeddings
 32 | After all of the previous processing is done, the generated embeddings can be stored within a [vector database](https://aws.amazon.com/what-is/vector-databases/) that's
 33 | specialized in the ability to detect the similarity of object and content based on the space between their vector point location. This is an important prerequisite to the
 34 | success of this RAG approach where there can large amounts of documents that together would be larger than the token limit of current LLM's. 
 35 | 
 36 | ## Standard Flow on Tool Usage on User Prompt Input
 37 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/472ee53d-7ccb-4f96-bd56-d1e81bd58c05)
 38 | 
 39 | ### User Prompt
 40 | The user prompt is the other input to this tool that is highly responsible for the output in which the vector database returns and for the LLM to process. An example prompt that
 41 | a pentester may have when having a large set of documents are are things like "is there an admin login page." or "what is the version of wordpress that this website is running on?".
 42 | After this user prompt is gathered, the tool can start it execution
 43 | 
 44 | ### User Prompt Input Conversion to Embedding
 45 | Because the documents gathered are now stored in the vector database alongside their embeddings, we need to convert the user input prompt into an embedding as well to calculate
 46 | similarity. This is done through the use of an embedding model just as explained above with the conversion of the documents into embeddings
 47 | 
 48 | ### Semantic Search Using Embeddings
 49 | Once the user prompt input is converted into embeddings, it is now possible to use [semantic search](https://www.elastic.co/what-is/semantic-search) to find the relevant documents that are most similar to the user input prompt.
 50 | 
 51 | ### Semantic Search Results
 52 | Once the semantic search has been completed using the embeddings, the output should be the most relevant documents in relation to the users input prompt that are ranked on the
 53 | calculated relevancy to it. For example, if a user is asking for if there is an admin login page, the top result would most likely be the admin login page document, with lower
 54 | ranked results possibly being other login pages or mentions of admins.
 55 | 
 56 | ### Construct the Prompt to Feed to the LLM
 57 | After gathering the most probable content that matches a users input, we can not just feed that directly to the LLM without any other context as it would not know exactly what the
 58 | user wants. There are two other parts that will need to be added to the prompt to the LLM, one being the user input prompt, and another being a system prompt. The user input prompt
 59 | is to guide the LLM to do the exact task that the user would like, such as identifying if there is an admin page. The system prompt is used to guide the LLM to answer in a specific
 60 | manner and to understand the context that the user is conducting a pentest and may want a more concise response. Putting these three things together creates the full prompt that will
 61 | be fed to the LLM.
 62 | 
 63 | ### Feeding Prompt to LLM
 64 | The constructed prompt is then fed to the LLM for it to summarize and provide specific context to the user. Depending on the system prompt given to the LLM, its response could be just the fact that it identified a possible result, or it can go further and explain why it things that it is or is not what the user is looking for. This is the most unpredictable step but
 65 | highly valuable given that the prompt is correct and no alignment in the LLM renders the results unusable.
 66 | 
 67 | ### Return the Output to the User
 68 | After the LLM has done what it was tasked with doing, the generated response should return to the user alongside the location or identifier of the files used as input as part of the prompt to the LLM. This could be the location of the admin document in the directory that was used to feed documents into the vector database, or even more verbose identifiers such as line numbers or file hashes. This output should give the output that the user was looking for and making the recon process much easier.
 69 | 
 70 | 
 71 | # ragscan initial design doc
 72 | 
 73 | ## Objective
 74 | 
 75 | The overall goal of this project is to test the application of Retreival Augmented Generation (RAG) to the reconnaissance phase of pentesting. RAG has shown
 76 | to be effective in giving Large Language Models (LLM's) dynamic information in which it is not trained on for use to answer the users prompts and instructions.
 77 | 
 78 | ## Overview
 79 | This document will show the high level design of the initial approach taken for this tool.
 80 | 
 81 | ## Processing of File and Document Input
 82 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/f56a6f25-44bc-4d77-88f5-318fdfd0865c)
 83 | 
 84 | ### Scrape Web Content
 85 | As of now, there is not any plans to add any tooling that will do the actual scraping of web content as it would put less focus on the core functionality
 86 | of this project and there are other tools that can do this much better and have had more project maturity. Many of these tools can be found in https://github.com/BruceDone/awesome-crawler.
 87 | 
 88 | ### Input/Scraped Web Content
 89 | The expected input to the tool are any pages, code, or information that was gathered through other tools and are stored in a directory in their original files and file
 90 | structure. This ensures that there is tracking as to where information that the LLM processes is from and is easily identifiable for the user if they wish to manually inspect
 91 | the identified file themselves after tool usage.
 92 | 
 93 | ### Initial Loading of Content
 94 | On execution of the tool, it will load all of the files in the specified directory, open them, convert them into standardized document objects that a vector
 95 | database will be able to identify and use.
 96 | 
 97 | ### Document to Embeddings Conversion
 98 | After each of the files are loaded as documents, they can be converted into [embeddings](https://www.cloudflare.com/learning/ai/what-are-embeddings/) which is a unique
 99 | representation of that specific object that can be used for more complex processing. This conversion is done through the use of [text embedding models](https://medium.com/@minh.hoque/text-embedding-models-an-insightful-dive-759ea53576f5) thats sole purpose is to create the vectorized representation of chunks of text.
100 | 
101 | ### Storing the Embeddings
102 | After all of the previous processing is done, the generated embeddings can be stored within a [vector database](https://aws.amazon.com/what-is/vector-databases/) that's
103 | specialized in the ability to detect the similarity of object and content based on the space between their vector point location. This is an important prerequisite to the
104 | success of this RAG approach where there can large amounts of documents that together would be larger than the token limit of current LLM's.
105 | 
106 | ## Standard Flow on Tool Usage on User Prompt Input
107 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/472ee53d-7ccb-4f96-bd56-d1e81bd58c05)
108 | 
109 | ### User Prompt
110 | The user prompt is the other input to this tool that is highly responsible for the output in which the vector database returns and for the LLM to process. An example prompt that
111 | a pentester may have when having a large set of documents are are things like "is there an admin login page." or "what is the version of wordpress that this website is running on?".
112 | After this user prompt is gathered, the tool can start it execution
113 | 
114 | ### User Prompt Input Conversion to Embedding
115 | Because the documents gathered are now stored in the vector database alongside their embeddings, we need to convert the user input prompt into an embedding as well to calculate
116 | similarity. This is done through the use of an embedding model just as explained above with the conversion of the documents into embeddings
117 | 
118 | ### Semantic Search Using Embeddings
119 | Once the user prompt input is converted into embeddings, it is now possible to use [semantic search](https://www.elastic.co/what-is/semantic-search) to find the relevant documents that are most similar to the user input prompt.
120 | 
121 | ### Semantic Search Results
122 | Once the semantic search has been completed using the embeddings, the output should be the most relevant documents in relation to the users input prompt that are ranked on the
123 | calculated relevancy to it. For example, if a user is asking for if there is an admin login page, the top result would most likely be the admin login page document, with lower
124 | ranked results possibly being other login pages or mentions of admins.
125 | 
126 | ### Construct the Prompt to Feed to the LLM
127 | After gathering the most probable content that matches a users input, we can not just feed that directly to the LLM without any other context as it would not know exactly what the
128 | user wants. There are two other parts that will need to be added to the prompt to the LLM, one being the user input prompt, and another being a system prompt. The user input prompt
129 | is to guide the LLM to do the exact task that the user would like, such as identifying if there is an admin page. The system prompt is used to guide the LLM to answer in a specific
130 | manner and to understand the context that the user is conducting a pentest and may want a more concise response. Putting these three things together creates the full prompt that will
131 | be fed to the LLM.
132 | 
133 | ### Feeding Prompt to LLM
134 | The constructed prompt is then fed to the LLM for it to summarize and provide specific context to the user. Depending on the system prompt given to the LLM, its response could be just the fact that it identified a possible result, or it can go further and explain why it things that it is or is not what the user is looking for. This is the most unpredictable step but
135 | highly valuable given that the prompt is correct and no alignment in the LLM renders the results unusable.
136 | 
137 | ### Return the Output to the User
138 | After the LLM has done what it was tasked with doing, the generated response should return to the user alongside the location or identifier of the files used as input as part of the prompt to the LLM. This could be the location of the admin document in the directory that was used to feed documents into the vector database, or even more verbose identifiers such as line numbers or file hashes. This output should give the output that the user was looking for and making the recon process much easier.
139 | 
140 | # ragscan initial design doc
141 | 
142 | ## Objective
143 | 
144 | The overall goal of this project is to test the application of Retreival Augmented Generation (RAG) to the reconnaissance phase of pentesting. RAG has shown
145 | to be effective in giving Large Language Models (LLM's) dynamic information in which it is not trained on for use to answer the users prompts and instructions.
146 | 
147 | ## Overview
148 | This document will show the high level design of the initial approach taken for this tool.
149 | 
150 | ## Processing of File and Document Input
151 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/f56a6f25-44bc-4d77-88f5-318fdfd0865c)
152 | 
153 | ### Scrape Web Content
154 | As of now, there is not any plans to add any tooling that will do the actual scraping of web content as it would put less focus on the core functionality
155 | of this project and there are other tools that can do this much better and have had more project maturity. Many of these tools can be found in https://github.com/BruceDone/awesome-crawler.
156 | 
157 | ### Input/Scraped Web Content
158 | The expected input to the tool are any pages, code, or information that was gathered through other tools and are stored in a directory in their original files and file
159 | structure. This ensures that there is tracking as to where information that the LLM processes is from and is easily identifiable for the user if they wish to manually inspect
160 | the identified file themselves after tool usage.
161 | 
162 | ### Initial Loading of Content
163 | On execution of the tool, it will load all of the files in the specified directory, open them, convert them into standardized document objects that a vector
164 | database will be able to identify and use.
165 | 
166 | ### Document to Embeddings Conversion
167 | After each of the files are loaded as documents, they can be converted into [embeddings](https://www.cloudflare.com/learning/ai/what-are-embeddings/) which is a unique
168 | representation of that specific object that can be used for more complex processing. This conversion is done through the use of [text embedding models](https://medium.com/@minh.hoque/text-embedding-models-an-insightful-dive-759ea53576f5) thats sole purpose is to create the vectorized representation of chunks of text.
169 | 
170 | ### Storing the Embeddings
171 | After all of the previous processing is done, the generated embeddings can be stored within a [vector database](https://aws.amazon.com/what-is/vector-databases/) that's
172 | specialized in the ability to detect the similarity of object and content based on the space between their vector point location. This is an important prerequisite to the
173 | success of this RAG approach where there can large amounts of documents that together would be larger than the token limit of current LLM's.
174 | 
175 | ## Standard Flow on Tool Usage on User Prompt Input
176 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/472ee53d-7ccb-4f96-bd56-d1e81bd58c05)
177 | 
178 | ### User Prompt
179 | The user prompt is the other input to this tool that is highly responsible for the output in which the vector database returns and for the LLM to process. An example prompt that
180 | a pentester may have when having a large set of documents are are things like "is there an admin login page." or "what is the version of wordpress that this website is running on?".
181 | After this user prompt is gathered, the tool can start it execution
182 | 
183 | ### User Prompt Input Conversion to Embedding
184 | Because the documents gathered are now stored in the vector database alongside their embeddings, we need to convert the user input prompt into an embedding as well to calculate
185 | similarity. This is done through the use of an embedding model just as explained above with the conversion of the documents into embeddings
186 | 
187 | ### Semantic Search Using Embeddings
188 | Once the user prompt input is converted into embeddings, it is now possible to use [semantic search](https://www.elastic.co/what-is/semantic-search) to find the relevant documents that are most similar to the user input prompt.
189 | 
190 | ### Semantic Search Results
191 | Once the semantic search has been completed using the embeddings, the output should be the most relevant documents in relation to the users input prompt that are ranked on the
192 | calculated relevancy to it. For example, if a user is asking for if there is an admin login page, the top result would most likely be the admin login page document, with lower
193 | ranked results possibly being other login pages or mentions of admins.
194 | 
195 | ### Construct the Prompt to Feed to the LLM
196 | After gathering the most probable content that matches a users input, we can not just feed that directly to the LLM without any other context as it would not know exactly what the
197 | user wants. There are two other parts that will need to be added to the prompt to the LLM, one being the user input prompt, and another being a system prompt. The user input prompt
198 | is to guide the LLM to do the exact task that the user would like, such as identifying if there is an admin page. The system prompt is used to guide the LLM to answer in a specific
199 | manner and to understand the context that the user is conducting a pentest and may want a more concise response. Putting these three things together creates the full prompt that will
200 | be fed to the LLM.
201 | 
202 | ### Feeding Prompt to LLM
203 | The constructed prompt is then fed to the LLM for it to summarize and provide specific context to the user. Depending on the system prompt given to the LLM, its response could be just the fact that it identified a possible result, or it can go further and explain why it things that it is or is not what the user is looking for. This is the most unpredictable step but
204 | highly valuable given that the prompt is correct and no alignment in the LLM renders the results unusable.
205 | 
206 | ### Return the Output to the User
207 | After the LLM has done what it was tasked with doing, the generated response should return to the user alongside the location or identifier of the files used as input as part of the prompt to the LLM. This could be the location of the admin document in the directory that was used to feed documents into the vector database, or even more verbose identifiers such as line numbers or file hashes. This output should give the output that the user was looking for and making the recon process much easier.
208 | 
209 | # ragscan initial design doc
210 | 
211 | ## Objective
212 | 
213 | The overall goal of this project is to test the application of Retreival Augmented Generation (RAG) to the reconnaissance phase of pentesting. RAG has shown
214 | to be effective in giving Large Language Models (LLM's) dynamic information in which it is not trained on for use to answer the users prompts and instructions.
215 | 
216 | ## Overview
217 | This document will show the high level design of the initial approach taken for this tool.
218 | 
219 | ## Processing of File and Document Input
220 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/f56a6f25-44bc-4d77-88f5-318fdfd0865c)
221 | 
222 | ### Scrape Web Content
223 | As of now, there is not any plans to add any tooling that will do the actual scraping of web content as it would put less focus on the core functionality
224 | of this project and there are other tools that can do this much better and have had more project maturity. Many of these tools can be found in https://github.com/BruceDone/awesome-crawler.
225 | 
226 | ### Input/Scraped Web Content
227 | The expected input to the tool are any pages, code, or information that was gathered through other tools and are stored in a directory in their original files and file
228 | structure. This ensures that there is tracking as to where information that the LLM processes is from and is easily identifiable for the user if they wish to manually inspect
229 | the identified file themselves after tool usage.
230 | 
231 | ### Initial Loading of Content
232 | On execution of the tool, it will load all of the files in the specified directory, open them, convert them into standardized document objects that a vector
233 | database will be able to identify and use.
234 | 
235 | ### Document to Embeddings Conversion
236 | After each of the files are loaded as documents, they can be converted into [embeddings](https://www.cloudflare.com/learning/ai/what-are-embeddings/) which is a unique
237 | representation of that specific object that can be used for more complex processing. This conversion is done through the use of [text embedding models](https://medium.com/@minh.hoque/text-embedding-models-an-insightful-dive-759ea53576f5) thats sole purpose is to create the vectorized representation of chunks of text.
238 | 
239 | ### Storing the Embeddings
240 | After all of the previous processing is done, the generated embeddings can be stored within a [vector database](https://aws.amazon.com/what-is/vector-databases/) that's
241 | specialized in the ability to detect the similarity of object and content based on the space between their vector point location. This is an important prerequisite to the
242 | success of this RAG approach where there can large amounts of documents that together would be larger than the token limit of current LLM's.
243 | 
244 | ## Standard Flow on Tool Usage on User Prompt Input
245 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/472ee53d-7ccb-4f96-bd56-d1e81bd58c05)
246 | 
247 | ### User Prompt
248 | The user prompt is the other input to this tool that is highly responsible for the output in which the vector database returns and for the LLM to process. An example prompt that
249 | a pentester may have when having a large set of documents are are things like "is there an admin login page." or "what is the version of wordpress that this website is running on?".
250 | After this user prompt is gathered, the tool can start it execution
251 | 
252 | ### User Prompt Input Conversion to Embedding
253 | Because the documents gathered are now stored in the vector database alongside their embeddings, we need to convert the user input prompt into an embedding as well to calculate
254 | similarity. This is done through the use of an embedding model just as explained above with the conversion of the documents into embeddings
255 | 
256 | ### Semantic Search Using Embeddings
257 | Once the user prompt input is converted into embeddings, it is now possible to use [semantic search](https://www.elastic.co/what-is/semantic-search) to find the relevant documents that are most similar to the user input prompt.
258 | 
259 | ### Semantic Search Results
260 | Once the semantic search has been completed using the embeddings, the output should be the most relevant documents in relation to the users input prompt that are ranked on the
261 | calculated relevancy to it. For example, if a user is asking for if there is an admin login page, the top result would most likely be the admin login page document, with lower
262 | ranked results possibly being other login pages or mentions of admins.
263 | 
264 | ### Construct the Prompt to Feed to the LLM
265 | After gathering the most probable content that matches a users input, we can not just feed that directly to the LLM without any other context as it would not know exactly what the
266 | user wants. There are two other parts that will need to be added to the prompt to the LLM, one being the user input prompt, and another being a system prompt. The user input prompt
267 | is to guide the LLM to do the exact task that the user would like, such as identifying if there is an admin page. The system prompt is used to guide the LLM to answer in a specific
268 | manner and to understand the context that the user is conducting a pentest and may want a more concise response. Putting these three things together creates the full prompt that will
269 | be fed to the LLM.
270 | 
271 | ### Feeding Prompt to LLM
272 | The constructed prompt is then fed to the LLM for it to summarize and provide specific context to the user. Depending on the system prompt given to the LLM, its response could be just the fact that it identified a possible result, or it can go further and explain why it things that it is or is not what the user is looking for. This is the most unpredictable step but
273 | highly valuable given that the prompt is correct and no alignment in the LLM renders the results unusable.
274 | 
275 | ### Return the Output to the User
276 | After the LLM has done what it was tasked with doing, the generated response should return to the user alongside the location or identifier of the files used as input as part of the prompt to the LLM. This could be the location of the admin document in the directory that was used to feed documents into the vector database, or even more verbose identifiers such as line numbers or file hashes. This output should give the output that the user was looking for and making the recon process much easier.
277 | 
278 | # ragscan initial design doc
279 | 
280 | ## Objective
281 | 
282 | The overall goal of this project is to test the application of Retreival Augmented Generation (RAG) to the reconnaissance phase of pentesting. RAG has shown
283 | to be effective in giving Large Language Models (LLM's) dynamic information in which it is not trained on for use to answer the users prompts and instructions.
284 | 
285 | ## Overview
286 | This document will show the high level design of the initial approach taken for this tool.
287 | 
288 | ## Processing of File and Document Input
289 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/f56a6f25-44bc-4d77-88f5-318fdfd0865c)
290 | 
291 | ### Scrape Web Content
292 | As of now, there is not any plans to add any tooling that will do the actual scraping of web content as it would put less focus on the core functionality
293 | of this project and there are other tools that can do this much better and have had more project maturity. Many of these tools can be found in https://github.com/BruceDone/awesome-crawler.
294 | 
295 | ### Input/Scraped Web Content
296 | The expected input to the tool are any pages, code, or information that was gathered through other tools and are stored in a directory in their original files and file
297 | structure. This ensures that there is tracking as to where information that the LLM processes is from and is easily identifiable for the user if they wish to manually inspect
298 | the identified file themselves after tool usage.
299 | 
300 | ### Initial Loading of Content
301 | On execution of the tool, it will load all of the files in the specified directory, open them, convert them into standardized document objects that a vector
302 | database will be able to identify and use.
303 | 
304 | ### Document to Embeddings Conversion
305 | After each of the files are loaded as documents, they can be converted into [embeddings](https://www.cloudflare.com/learning/ai/what-are-embeddings/) which is a unique
306 | representation of that specific object that can be used for more complex processing. This conversion is done through the use of [text embedding models](https://medium.com/@minh.hoque/text-embedding-models-an-insightful-dive-759ea53576f5) thats sole purpose is to create the vectorized representation of chunks of text.
307 | 
308 | ### Storing the Embeddings
309 | After all of the previous processing is done, the generated embeddings can be stored within a [vector database](https://aws.amazon.com/what-is/vector-databases/) that's
310 | specialized in the ability to detect the similarity of object and content based on the space between their vector point location. This is an important prerequisite to the
311 | success of this RAG approach where there can large amounts of documents that together would be larger than the token limit of current LLM's.
312 | 
313 | ## Standard Flow on Tool Usage on User Prompt Input
314 | ![image](https://github.com/WeebSoftware/ragscan/assets/50147562/472ee53d-7ccb-4f96-bd56-d1e81bd58c05)
315 | 
316 | ### User Prompt
317 | The user prompt is the other input to this tool that is highly responsible for the output in which the vector database returns and for the LLM to process. An example prompt that
318 | a pentester may have when having a large set of documents are are things like "is there an admin login page." or "what is the version of wordpress that this website is running on?".
319 | After this user prompt is gathered, the tool can start it execution
320 | 
321 | ### User Prompt Input Conversion to Embedding
322 | Because the documents gathered are now stored in the vector database alongside their embeddings, we need to convert the user input prompt into an embedding as well to calculate
323 | similarity. This is done through the use of an embedding model just as explained above with the conversion of the documents into embeddings
324 | 
325 | ### Semantic Search Using Embeddings
326 | Once the user prompt input is converted into embeddings, it is now possible to use [semantic search](https://www.elastic.co/what-is/semantic-search) to find the relevant documents that are most similar to the user input prompt.
327 | 
328 | ### Semantic Search Results
329 | Once the semantic search has been completed using the embeddings, the output should be the most relevant documents in relation to the users input prompt that are ranked on the
330 | calculated relevancy to it. For example, if a user is asking for if there is an admin login page, the top result would most likely be the admin login page document, with lower
331 | ranked results possibly being other login pages or mentions of admins.
332 | 
333 | ### Construct the Prompt to Feed to the LLM
334 | After gathering the most probable content that matches a users input, we can not just feed that directly to the LLM without any other context as it would not know exactly what the
335 | user wants. There are two other parts that will need to be added to the prompt to the LLM, one being the user input prompt, and another being a system prompt. The user input prompt
336 | is to guide the LLM to do the exact task that the user would like, such as identifying if there is an admin page. The system prompt is used to guide the LLM to answer in a specific
337 | manner and to understand the context that the user is conducting a pentest and may want a more concise response. Putting these three things together creates the full prompt that will
338 | be fed to the LLM.
339 | 
340 | ### Feeding Prompt to LLM
341 | The constructed prompt is then fed to the LLM for it to summarize and provide specific context to the user. Depending on the system prompt given to the LLM, its response could be just the fact that it identified a possible result, or it can go further and explain why it things that it is or is not what the user is looking for. This is the most unpredictable step but
342 | highly valuable given that the prompt is correct and no alignment in the LLM renders the results unusable.
343 | 
344 | ### Return the Output to the User
345 | After the LLM has done what it was tasked with doing, the generated response should return to the user alongside the location or identifier of the files used as input as part of the prompt to the LLM. This could be the location of the admin document in the directory that was used to feed documents into the vector database, or even more verbose identifiers such as line numbers or file hashes. This output should give the output that the user was looking for and making the recon process much easier.
346 | 


--------------------------------------------------------------------------------