├── doc ├── images │ ├── 1.png │ ├── 2.png │ ├── 3.png │ └── 4.png ├── 合工大24年软工实训_课题介绍及分组选题.pdf └── Spring AI和向量数据库.md ├── .mvn └── wrapper │ ├── maven-wrapper.jar │ └── maven-wrapper.properties ├── src ├── main │ ├── resources │ │ └── application.yml │ └── java │ │ └── com │ │ └── ningning0111 │ │ └── vectordatabasedemo │ │ ├── VectorDatabaseDemoApplication.java │ │ ├── controller │ │ ├── PdfUploadController.java │ │ └── ChatController.java │ │ ├── config │ │ └── ApplicationConfig.java │ │ └── service │ │ ├── ChatService.java │ │ └── PdfStoreService.java └── test │ └── java │ └── com │ └── ningning0111 │ └── vectordatabasedemo │ └── VectorDatabaseDemoApplicationTests.java ├── README.md ├── .gitignore ├── docker-compose.yml └── pom.xml /doc/images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NingNing0111/vector-database-demo/HEAD/doc/images/1.png -------------------------------------------------------------------------------- /doc/images/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NingNing0111/vector-database-demo/HEAD/doc/images/2.png -------------------------------------------------------------------------------- /doc/images/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NingNing0111/vector-database-demo/HEAD/doc/images/3.png -------------------------------------------------------------------------------- /doc/images/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NingNing0111/vector-database-demo/HEAD/doc/images/4.png -------------------------------------------------------------------------------- /doc/合工大24年软工实训_课题介绍及分组选题.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NingNing0111/vector-database-demo/HEAD/doc/合工大24年软工实训_课题介绍及分组选题.pdf -------------------------------------------------------------------------------- /.mvn/wrapper/maven-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NingNing0111/vector-database-demo/HEAD/.mvn/wrapper/maven-wrapper.jar -------------------------------------------------------------------------------- /.mvn/wrapper/maven-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.5/apache-maven-3.9.5-bin.zip 2 | wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar 3 | -------------------------------------------------------------------------------- /src/main/resources/application.yml: -------------------------------------------------------------------------------- 1 | server: 2 | port: 8801 3 | 4 | spring: 5 | ai: 6 | openai: 7 | base-url: https://api.example.com 8 | api-key: sk-aec103e6cfxxxxxxxxxxxxxxxxxxxxxxx71da57a 9 | 10 | datasource: 11 | username: postgres 12 | password: postgres 13 | url: jdbc:postgresql://localhost/vector_store 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |  这是Spring AI的使用demo,使用PGVector作为向量数据库。 2 | 3 | ## 介绍 4 | 5 | - 将PDF内容向量化并保存在向量数据库中,本Demo使用的是PGVector; 6 | - 调用对话前根据问题从向量数据库中检索最相似的几条记录; 7 | - 封装数据,一并返回给大语言模型 8 | - 大语言模型根据上下文数据进行回复 9 | 10 | ## 效果图 11 | 12 |  以24年合工大软工实训的pdf文件为例,通过向chatgpt提问与文档内容相关的问题。 13 | 14 | ![img.png](doc/images/1.png) 15 | 16 | ![img_1.png](doc/images/2.png) 17 | 18 | ![img_2.png](doc/images/3.png) -------------------------------------------------------------------------------- 
/src/main/java/com/ningning0111/vectordatabasedemo/VectorDatabaseDemoApplication.java: -------------------------------------------------------------------------------- 1 | package com.ningning0111.vectordatabasedemo; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class VectorDatabaseDemoApplication { 8 | 9 | public static void main(String[] args) { 10 | SpringApplication.run(VectorDatabaseDemoApplication.class, args); 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | HELP.md 2 | target/ 3 | !.mvn/wrapper/maven-wrapper.jar 4 | !**/src/main/**/target/ 5 | !**/src/test/**/target/ 6 | 7 | ### STS ### 8 | .apt_generated 9 | .classpath 10 | .factorypath 11 | .project 12 | .settings 13 | .springBeans 14 | .sts4-cache 15 | 16 | ### IntelliJ IDEA ### 17 | .idea 18 | *.iws 19 | *.iml 20 | *.ipr 21 | 22 | ### NetBeans ### 23 | /nbproject/private/ 24 | /nbbuild/ 25 | /dist/ 26 | /nbdist/ 27 | /.nb-gradle/ 28 | build/ 29 | !**/src/main/**/build/ 30 | !**/src/test/**/build/ 31 | 32 | ### VS Code ### 33 | .vscode/ 34 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | postgres: 4 | image: ankane/pgvector:v0.5.0 5 | restart: always 6 | environment: 7 | - POSTGRES_USER=postgres 8 | - POSTGRES_PASSWORD=postgres 9 | - POSTGRES_DB=vector_store 10 | - PGPASSWORD=postgres 11 | logging: 12 | options: 13 | max-size: 10m 14 | max-file: "3" 15 | ports: 16 | - '5432:5432' 17 | healthcheck: 18 | test: "pg_isready -U postgres -d vector_store" 19 | interval: 2s 20 | timeout: 20s 21 | retries: 10 
-------------------------------------------------------------------------------- /src/main/java/com/ningning0111/vectordatabasedemo/controller/PdfUploadController.java: -------------------------------------------------------------------------------- 1 | package com.ningning0111.vectordatabasedemo.controller; 2 | 3 | import com.ningning0111.vectordatabasedemo.service.PdfStoreService; 4 | import lombok.RequiredArgsConstructor; 5 | import org.springframework.stereotype.Controller; 6 | import org.springframework.web.bind.annotation.PostMapping; 7 | import org.springframework.web.bind.annotation.RequestMapping; 8 | import org.springframework.web.bind.annotation.RequestParam; 9 | import org.springframework.web.multipart.MultipartFile; 10 | 11 | 12 | /** 13 | * @Project: com.ningning0111.vectordatabasedemo.controller 14 | * @Author: pgthinker 15 | * @GitHub: https://github.com/ningning0111 16 | * @Date: 2024/2/7 18:22 17 | * @Description: 18 | */ 19 | @Controller 20 | @RequestMapping("/api/v1/pdf") 21 | @RequiredArgsConstructor 22 | public class PdfUploadController { 23 | private final PdfStoreService pdfStoreService; 24 | 25 | @PostMapping("/upload") 26 | public void upload( 27 | @RequestParam MultipartFile file 28 | ){ 29 | pdfStoreService.saveSource(file); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/com/ningning0111/vectordatabasedemo/controller/ChatController.java: -------------------------------------------------------------------------------- 1 | package com.ningning0111.vectordatabasedemo.controller; 2 | 3 | import com.ningning0111.vectordatabasedemo.service.ChatService; 4 | import lombok.RequiredArgsConstructor; 5 | import org.springframework.web.bind.annotation.GetMapping; 6 | import org.springframework.web.bind.annotation.RequestMapping; 7 | import org.springframework.web.bind.annotation.RequestParam; 8 | import org.springframework.web.bind.annotation.RestController; 9 | 10 | /** 11 | * 
@Project: com.ningning0111.vectordatabasedemo.controller 12 | * @Author: pgthinker 13 | * @GitHub: https://github.com/ningning0111 14 | * @Date: 2024/2/7 16:50 15 | * @Description: 16 | */ 17 | @RestController 18 | @RequiredArgsConstructor 19 | @RequestMapping("/api/v1/chat") 20 | public class ChatController { 21 | 22 | private final ChatService chatService; 23 | 24 | @GetMapping("/simple") 25 | public String simpleChat( 26 | @RequestParam String message 27 | ){ 28 | return chatService.simpleChat(message); 29 | } 30 | 31 | @GetMapping("/") 32 | public String chat( 33 | @RequestParam String message 34 | ){ 35 | return chatService.chatByVectorStore(message); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/ningning0111/vectordatabasedemo/config/ApplicationConfig.java: -------------------------------------------------------------------------------- 1 | package com.ningning0111.vectordatabasedemo.config; 2 | 3 | import org.springframework.ai.embedding.EmbeddingClient; 4 | import org.springframework.ai.transformer.splitter.TokenTextSplitter; 5 | import org.springframework.ai.vectorstore.PgVectorStore; 6 | import org.springframework.ai.vectorstore.VectorStore; 7 | import org.springframework.context.annotation.Bean; 8 | import org.springframework.context.annotation.Configuration; 9 | import org.springframework.jdbc.core.JdbcTemplate; 10 | 11 | /** 12 | * @Project: com.ningning0111.vectordatabasedemo.config 13 | * @Author: pgthinker 14 | * @GitHub: https://github.com/ningning0111 15 | * @Date: 2024/2/7 16:42 16 | * @Description: 17 | */ 18 | @Configuration 19 | public class ApplicationConfig { 20 | 21 | /** 22 | * 向量数据库进行检索操作 23 | * @param embeddingClient 24 | * @param jdbcTemplate 25 | * @return 26 | */ 27 | @Bean 28 | public VectorStore vectorStore(EmbeddingClient embeddingClient, JdbcTemplate jdbcTemplate){ 29 | return new PgVectorStore(jdbcTemplate,embeddingClient); 30 | } 31 | 32 | /** 33 | * 文本分割器 
34 | * @return 35 | */ 36 | @Bean 37 | public TokenTextSplitter tokenTextSplitter() { 38 | return new TokenTextSplitter(); 39 | } 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/test/java/com/ningning0111/vectordatabasedemo/VectorDatabaseDemoApplicationTests.java: -------------------------------------------------------------------------------- 1 | package com.ningning0111.vectordatabasedemo; 2 | 3 | import com.ningning0111.vectordatabasedemo.service.ChatService; 4 | import com.ningning0111.vectordatabasedemo.service.PdfStoreService; 5 | import org.junit.jupiter.api.Test; 6 | import org.springframework.ai.reader.ExtractedTextFormatter; 7 | import org.springframework.ai.reader.pdf.PagePdfDocumentReader; 8 | import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig; 9 | import org.springframework.ai.transformer.splitter.TokenTextSplitter; 10 | import org.springframework.ai.vectorstore.VectorStore; 11 | import org.springframework.beans.factory.annotation.Autowired; 12 | import org.springframework.boot.test.context.SpringBootTest; 13 | import org.springframework.core.io.ResourceLoader; 14 | 15 | @SpringBootTest 16 | class VectorDatabaseDemoApplicationTests { 17 | 18 | @Autowired 19 | private ResourceLoader resourceLoader; 20 | 21 | @Autowired 22 | private TokenTextSplitter tokenTextSplitter; 23 | 24 | @Autowired 25 | private VectorStore vectorStore; 26 | 27 | @Autowired 28 | private ChatService chatService; 29 | 30 | @Autowired 31 | private PdfStoreService pdfStoreService; 32 | 33 | 34 | 35 | @Test 36 | public void savePdfToVectorDatabase(){ 37 | System.out.println("加载本地资源...."); 38 | var pdfSource = resourceLoader.getResource("file:" + "doc/合工大24年软工实训_课题介绍及分组选题.pdf"); 39 | var config = PdfDocumentReaderConfig.builder() 40 | .withPageExtractedTextFormatter( 41 | new ExtractedTextFormatter 42 | .Builder() 43 | .withNumberOfBottomTextLinesToDelete(3) 44 | .withNumberOfTopPagesToSkipBeforeDelete(1) 
45 | .build() 46 | ) 47 | .withPagesPerDocument(1) 48 | .build(); 49 | var pagePdfDocumentReader = new PagePdfDocumentReader(pdfSource, config); 50 | vectorStore.accept(tokenTextSplitter.apply(pagePdfDocumentReader.get())); 51 | } 52 | 53 | @Test 54 | public void chatTest(){ 55 | System.out.println(chatService.chatByVectorStore("项目中较难的项目有哪些?")); 56 | } 57 | 58 | @Test 59 | public void savePdfByPage(){ 60 | String url = "file:doc/合工大24年软工实训_课题介绍及分组选题.pdf"; 61 | pdfStoreService.saveSourceByPage(url); 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/com/ningning0111/vectordatabasedemo/service/ChatService.java: -------------------------------------------------------------------------------- 1 | package com.ningning0111.vectordatabasedemo.service; 2 | 3 | import lombok.RequiredArgsConstructor; 4 | import org.springframework.ai.chat.ChatClient; 5 | import org.springframework.ai.chat.ChatResponse; 6 | import org.springframework.ai.chat.messages.Message; 7 | import org.springframework.ai.chat.messages.UserMessage; 8 | import org.springframework.ai.chat.prompt.Prompt; 9 | import org.springframework.ai.chat.prompt.SystemPromptTemplate; 10 | import org.springframework.ai.document.Document; 11 | import org.springframework.ai.vectorstore.VectorStore; 12 | import org.springframework.stereotype.Service; 13 | 14 | import java.util.List; 15 | import java.util.Map; 16 | import java.util.stream.Collectors; 17 | 18 | /** 19 | * @Project: com.ningning0111.vectordatabasedemo.service 20 | * @Author: pgthinker 21 | * @GitHub: https://github.com/ningning0111 22 | * @Date: 2024/2/7 17:34 23 | * @Description: 24 | */ 25 | @Service 26 | @RequiredArgsConstructor 27 | public class ChatService { 28 | 29 | // 系统提示词 30 | private final static String SYSTEM_PROMPT = """ 31 | 你需要使用文档内容对用户提出的问题进行回复,同时你需要表现得天生就知道这些内容, 32 | 不能在回复中体现出你是根据给出的文档内容进行回复的,这点非常重要。 33 | 34 | 当用户提出的问题无法根据文档内容进行回复或者你也不清楚时,回复不知道即可。 35 | 36 | 文档内容如下: 37 | 
{documents} 38 | 39 | """; 40 | 41 | private final ChatClient chatClient; 42 | private final VectorStore vectorStore; 43 | 44 | // 简单的对话,不对向量数据库进行检索 45 | public String simpleChat(String userMessage) { 46 | return chatClient.call(userMessage); 47 | } 48 | 49 | // 通过向量数据库进行检索 50 | public String chatByVectorStore(String message) { 51 | // 根据问题文本进行相似性搜索 52 | List listOfSimilarDocuments = vectorStore.similaritySearch(message); 53 | // 将Document列表中每个元素的content内容进行拼接获得documents 54 | String documents = listOfSimilarDocuments.stream().map(Document::getContent).collect(Collectors.joining()); 55 | // 使用Spring AI 提供的模板方式构建SystemMessage对象 56 | Message systemMessage = new SystemPromptTemplate(SYSTEM_PROMPT).createMessage(Map.of("documents", documents)); 57 | // 构建UserMessage对象 58 | UserMessage userMessage = new UserMessage(message); 59 | // 将Message列表一并发送给ChatGPT 60 | ChatResponse rsp = chatClient.call(new Prompt(List.of(systemMessage, userMessage))); 61 | return rsp.getResult().getOutput().getContent(); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | org.springframework.boot 7 | spring-boot-starter-parent 8 | 3.2.2 9 | 10 | 11 | com.ningning0111 12 | vector-database-demo 13 | 0.0.1-SNAPSHOT 14 | vector-database-demo 15 | vector-database-demo 16 | 17 | 17 18 | 0.8.0-SNAPSHOT 19 | 20 | 21 | 22 | org.springframework.boot 23 | spring-boot-starter-jdbc 24 | 25 | 26 | org.springframework.boot 27 | spring-boot-starter-web 28 | 29 | 30 | 31 | org.projectlombok 32 | lombok 33 | true 34 | 35 | 36 | org.springframework.boot 37 | spring-boot-starter-test 38 | test 39 | 40 | 41 | 42 | org.springframework.ai 43 | spring-ai-openai-spring-boot-starter 44 | ${spring-ai.version} 45 | 46 | 47 | 48 | org.springframework.ai 49 | spring-ai-pgvector-store-spring-boot-starter 50 | ${spring-ai.version} 51 | 52 | 53 | 54 | 
org.springframework.ai 55 | spring-ai-pdf-document-reader 56 | ${spring-ai.version} 57 | 58 | 59 | 60 | 61 | 62 | spring-milestones 63 | Spring Milestones 64 | https://repo.spring.io/milestone 65 | 66 | false 67 | 68 | 69 | 70 | spring-snapshots 71 | Spring Snapshots 72 | https://repo.spring.io/snapshot 73 | 74 | false 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | org.springframework.boot 83 | spring-boot-maven-plugin 84 | 85 | 86 | 87 | org.projectlombok 88 | lombok 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /src/main/java/com/ningning0111/vectordatabasedemo/service/PdfStoreService.java: -------------------------------------------------------------------------------- 1 | package com.ningning0111.vectordatabasedemo.service; 2 | 3 | import lombok.RequiredArgsConstructor; 4 | import org.springframework.ai.reader.ExtractedTextFormatter; 5 | import org.springframework.ai.reader.pdf.PagePdfDocumentReader; 6 | import org.springframework.ai.reader.pdf.ParagraphPdfDocumentReader; 7 | import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig; 8 | import org.springframework.ai.transformer.splitter.TokenTextSplitter; 9 | import org.springframework.ai.vectorstore.VectorStore; 10 | import org.springframework.core.io.DefaultResourceLoader; 11 | import org.springframework.core.io.FileSystemResource; 12 | import org.springframework.core.io.Resource; 13 | import org.springframework.stereotype.Service; 14 | import org.springframework.web.multipart.MultipartFile; 15 | 16 | import java.io.IOException; 17 | import java.nio.file.Files; 18 | import java.nio.file.Path; 19 | 20 | /** 21 | * @Project: com.ningning0111.vectordatabasedemo.service 22 | * @Author: pgthinker 23 | * @GitHub: https://github.com/ningning0111 24 | * @Date: 2024/2/7 16:48 25 | * @Description: 26 | */ 27 | @Service 28 | @RequiredArgsConstructor 29 | public class PdfStoreService { 30 | 31 | private final DefaultResourceLoader 
resourceLoader; 32 | private final VectorStore vectorStore; 33 | private final TokenTextSplitter tokenTextSplitter; 34 | 35 | /** 36 | * 根据PDF的页数进行分割 37 | * @param url 38 | */ 39 | public void saveSourceByPage(String url){ 40 | // 加载资源,需要本地路径的信息 41 | Resource resource = resourceLoader.getResource(url); 42 | // 加载PDF文件时的配置对象 43 | PdfDocumentReaderConfig loadConfig = PdfDocumentReaderConfig.builder() 44 | .withPageExtractedTextFormatter( 45 | new ExtractedTextFormatter 46 | .Builder() 47 | .withNumberOfBottomTextLinesToDelete(3) 48 | .withNumberOfTopPagesToSkipBeforeDelete(1) 49 | .build() 50 | ) 51 | .withPagesPerDocument(1) 52 | .build(); 53 | 54 | PagePdfDocumentReader pagePdfDocumentReader = new PagePdfDocumentReader(resource, loadConfig); 55 | // 存储到向量数据库中 56 | vectorStore.accept(tokenTextSplitter.apply(pagePdfDocumentReader.get())); 57 | } 58 | 59 | /** 60 | * 根据PDF的目录(段落)进行划分 61 | * @param url 62 | */ 63 | public void saveSourceByParagraph(String url){ 64 | Resource resource = resourceLoader.getResource(url); 65 | 66 | PdfDocumentReaderConfig loadConfig = PdfDocumentReaderConfig.builder() 67 | .withPageExtractedTextFormatter( 68 | new ExtractedTextFormatter 69 | .Builder() 70 | .withNumberOfBottomTextLinesToDelete(3) 71 | .withNumberOfTopPagesToSkipBeforeDelete(1) 72 | .build() 73 | ) 74 | .withPagesPerDocument(1) 75 | .build(); 76 | 77 | ParagraphPdfDocumentReader pdfReader = new ParagraphPdfDocumentReader( 78 | resource, 79 | loadConfig 80 | ); 81 | vectorStore.accept(tokenTextSplitter.apply(pdfReader.get())); 82 | } 83 | 84 | /** 85 | * MultipartFile对象存储,采用PagePdfDocumentReader 86 | * @param file 87 | */ 88 | public void saveSource(MultipartFile file){ 89 | try { 90 | // 获取文件名 91 | String fileName = file.getOriginalFilename(); 92 | // 获取文件内容类型 93 | String contentType = file.getContentType(); 94 | // 获取文件字节数组 95 | byte[] bytes = file.getBytes(); 96 | // 创建一个临时文件 97 | Path tempFile = Files.createTempFile("temp-", fileName); 98 | // 将文件字节数组保存到临时文件 99 | 
Files.write(tempFile, bytes); 100 | // 创建一个 FileSystemResource 对象 101 | Resource fileResource = new FileSystemResource(tempFile.toFile()); 102 | PdfDocumentReaderConfig loadConfig = PdfDocumentReaderConfig.builder() 103 | .withPageExtractedTextFormatter( 104 | new ExtractedTextFormatter 105 | .Builder() 106 | .withNumberOfBottomTextLinesToDelete(3) 107 | .withNumberOfTopPagesToSkipBeforeDelete(1) 108 | .build() 109 | ) 110 | .withPagesPerDocument(1) 111 | .build(); 112 | PagePdfDocumentReader pagePdfDocumentReader = new PagePdfDocumentReader(fileResource, loadConfig); 113 | vectorStore.accept(tokenTextSplitter.apply(pagePdfDocumentReader.get())); 114 | }catch (IOException e){ 115 | e.printStackTrace(); 116 | } 117 | 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /doc/Spring AI和向量数据库.md: -------------------------------------------------------------------------------- 1 | # Spring AI - 使用向量数据库实现检索式AI对话 2 | 3 |  [Spring AI](https://docs.spring.io/spring-ai/reference/) 并不仅限于针对大语言模型对话API进行了统一封装,它还可以通过简单的方式实现[LangChain](https://github.com/langchain-ai/langchain)的一些功能。本篇将带领读者实现一个简单的检索式AI对话接口。 4 | 5 | ## 一、需求背景 6 | 7 |  在一些场景下,我们想让AI根据我们提供的数据进行回复。因为对话有最大Token的限制,因此很多场景下我们是无法直接将所有的数据发给AI的,一方面在数据量很大的情况下,会突破Token的限制,另一方面,在不突破Token限制的情况下也会有不必要的对话费用开销。因此我们如何在花费最少费用的同时又能让AI更好的根据我们提供的数据进行回复是一个非常关键的问题。针对这一问题,我们可以采用数据向量化的方式来解决。 8 | 9 | ## 二、实现原理 10 | 11 | 将我们个人数据存储到向量数据库中。然后,在用户想AI发起对话之前,首先从向量数据库中检索一组相似的文档。然后,将这些文档作为用户问题的上下文,并与用户的对话一起发送到 AI 模型,从而实现精确性的回复。这种方式称为`检索增强生成(RAG)。 12 | 13 | ### 第一步:数据向量化 14 | 15 |  我们有很多种方式将数据向量化,最简单的就是通过调用第三方API来实现。以OpenAI的API为例,它提供了` 16 | https://api.openai.com/v1/embeddings` 接口,通过请求该接口可以获取某段文本的向量化的数据。具体可参考官方API介绍:[Create embeddings](https://platform.openai.com/docs/api-reference/embeddings/create)。在Spring AI中,我们不必调用该接口手动进行向量化处理,在存储到向量数据库的时候,Spring AI会自动调用的。 17 | 18 | ![img.png](images/4.png) 19 | 20 | ### 第二步:向量存储及检索 21 | 22 |  在Spring AI中有一个`VectorStore`抽象接口,该接口定义了Spring 
AI与向量数据库的交互操作,我们只需通过简单的向量数据库的配置即可使用该接口对向量数据库进行操作。 23 | 24 | ```java 25 | public interface VectorStore { 26 | 27 | void add(List documents); 28 | 29 | Optional delete(List idList); 30 | 31 | List similaritySearch(String query); 32 | 33 | List similaritySearch(SearchRequest request); 34 | } 35 | ``` 36 | 37 | >  向量数据库(Vector Database)是一种特殊类型的数据库,在人工智能应用中发挥着重要作用。在向量数据库中,查询操作与传统的关系数据库不同。它们是执行相似性搜索,而不是精确匹配。当给定向量作为查询时,向量数据库返回与查询向量“相似”的向量。通过这种方式,我们就能将个人的数据与AI模型进行集成。` 38 | > 39 | >  常见的向量数据库有:[`Chroma`](https://www.trychroma.com/)、[`Milvus`](https://milvus.io/)、[`Pgvector`](https://github.com/pgvector/pgvector)、[`Redis`](https://redis.io/)、[`Neo4j`](https://neo4j.com/)等。 40 | 41 | 42 | ## 三、代码实现 43 | 44 |  本篇将实现基于ChatGPT的RAG和上传PDF文件存储至向量数据库的接口,向量数据库使用`Pgvector`。Pgvector是基于PostgreSQL进行的扩展,可以存储和检索机器学习过程中生成的embeddings。 45 | 46 | > 源码已上传至GitHub: [https://github.com/NingNing0111/vector-database-demo](https://github.com/NingNing0111/vector-database-demo) 47 | 48 | ### 版本信息 49 | 50 | - JDK >= 17 51 | - Spring Boot >= 3.2.2 52 | - Spring AI = 0.8.0-SNAPSHOT 53 | 54 | ### 1. 安装Pgvector 55 | 56 |  Pgvector将使用Docker安装。`docker-compose.yml`文件如下: 57 | 58 | ```yml 59 | version: '3.7' 60 | services: 61 | postgres: 62 | image: ankane/pgvector:v0.5.0 63 | restart: always 64 | environment: 65 | - POSTGRES_USER=postgres 66 | - POSTGRES_PASSWORD=postgres 67 | - POSTGRES_DB=vector_store 68 | - PGPASSWORD=postgres 69 | logging: 70 | options: 71 | max-size: 10m 72 | max-file: "3" 73 | ports: 74 | - '5432:5432' 75 | healthcheck: 76 | test: "pg_isready -U postgres -d vector_store" 77 | interval: 2s 78 | timeout: 20s 79 | retries: 10 80 | ``` 81 | 82 | ### 2. 
创建Spring项目,添加依赖 83 | 84 |  Spring 项目的创建过程略,`pom.xml`核心内容如下: 85 | 86 | ```xml 87 | 88 | 17 89 | 90 | 0.8.0-SNAPSHOT 91 | 92 | 93 | 94 | 95 | org.springframework.boot 96 | spring-boot-starter-jdbc 97 | 98 | 99 | 100 | 101 | org.springframework.ai 102 | spring-ai-openai-spring-boot-starter 103 | ${spring-ai.version} 104 | 105 | 106 | 107 | org.springframework.ai 108 | spring-ai-pgvector-store-spring-boot-starter 109 | ${spring-ai.version} 110 | 111 | 112 | 113 | org.springframework.ai 114 | spring-ai-pdf-document-reader 115 | ${spring-ai.version} 116 | 117 | 118 | 119 | 120 | 121 | spring-milestones 122 | Spring Milestones 123 | https://repo.spring.io/milestone 124 | 125 | false 126 | 127 | 128 | 129 | spring-snapshots 130 | Spring Snapshots 131 | https://repo.spring.io/snapshot 132 | 133 | false 134 | 135 | 136 | 137 | ``` 138 | 139 | ### 3. 配置API、Key、PGVector连接信息 140 | 141 | ```yaml 142 | server: 143 | port: 8801 144 | 145 | spring: 146 | ai: 147 | openai: 148 | base-url: https://api.example.com 149 | api-key: sk-aec103e6cfxxxxxxxxxxxxxxxxxxxxxxx71da57a 150 | 151 | datasource: 152 | username: postgres 153 | password: postgres 154 | url: jdbc:postgresql://localhost/vector_store 155 | 156 | ``` 157 | 158 | ### 4. 
创建VectorStore和文本分割器TokenTextSplitter 159 | 160 |  这里我创建了一个`ApplicationConfig`配置类 161 | 162 | ```java 163 | package com.ningning0111.vectordatabasedemo.config; 164 | 165 | import org.springframework.ai.embedding.EmbeddingClient; 166 | import org.springframework.ai.transformer.splitter.TokenTextSplitter; 167 | import org.springframework.ai.vectorstore.PgVectorStore; 168 | import org.springframework.ai.vectorstore.VectorStore; 169 | import org.springframework.context.annotation.Bean; 170 | import org.springframework.context.annotation.Configuration; 171 | import org.springframework.jdbc.core.JdbcTemplate; 172 | 173 | @Configuration 174 | public class ApplicationConfig { 175 | 176 | /** 177 | * 向量数据库进行检索操作 178 | * @param embeddingClient 179 | * @param jdbcTemplate 180 | * @return 181 | */ 182 | @Bean 183 | public VectorStore vectorStore(EmbeddingClient embeddingClient, JdbcTemplate jdbcTemplate){ 184 | return new PgVectorStore(jdbcTemplate,embeddingClient); 185 | } 186 | 187 | /** 188 | * 文本分割器 189 | * @return 190 | */ 191 | @Bean 192 | public TokenTextSplitter tokenTextSplitter() { 193 | return new TokenTextSplitter(); 194 | } 195 | } 196 | ``` 197 | 198 | ### 5. 
构建PDF存储服务层 199 | 200 |  在service层下创建一个名为`PdfStoreService`的类,用于将PDF文件存储到向量数据库中。 201 | 202 | ```java 203 | package com.ningning0111.vectordatabasedemo.service; 204 | 205 | import lombok.RequiredArgsConstructor; 206 | import org.springframework.ai.reader.ExtractedTextFormatter; 207 | import org.springframework.ai.reader.pdf.PagePdfDocumentReader; 208 | import org.springframework.ai.reader.pdf.ParagraphPdfDocumentReader; 209 | import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig; 210 | import org.springframework.ai.transformer.splitter.TokenTextSplitter; 211 | import org.springframework.ai.vectorstore.VectorStore; 212 | import org.springframework.core.io.DefaultResourceLoader; 213 | import org.springframework.core.io.FileSystemResource; 214 | import org.springframework.core.io.Resource; 215 | import org.springframework.stereotype.Service; 216 | import org.springframework.web.multipart.MultipartFile; 217 | 218 | import java.io.IOException; 219 | import java.nio.file.Files; 220 | import java.nio.file.Path; 221 | 222 | /** 223 | * @Project: com.ningning0111.vectordatabasedemo.service 224 | * @Author: pgthinker 225 | * @GitHub: https://github.com/ningning0111 226 | * @Date: 2024/2/7 16:48 227 | * @Description: 228 | */ 229 | @Service 230 | @RequiredArgsConstructor 231 | public class PdfStoreService { 232 | 233 | private final DefaultResourceLoader resourceLoader; 234 | private final VectorStore vectorStore; 235 | private final TokenTextSplitter tokenTextSplitter; 236 | 237 | /** 238 | * 根据PDF的页数进行分割 239 | * @param url 240 | */ 241 | public void saveSourceByPage(String url){ 242 | // 加载资源,需要本地路径的信息 243 | Resource resource = resourceLoader.getResource(url); 244 | // 加载PDF文件时的配置对象 245 | PdfDocumentReaderConfig loadConfig = PdfDocumentReaderConfig.builder() 246 | .withPageExtractedTextFormatter( 247 | new ExtractedTextFormatter 248 | .Builder() 249 | .withNumberOfBottomTextLinesToDelete(3) 250 | .withNumberOfTopPagesToSkipBeforeDelete(1) 251 | .build() 252 | 
) 253 | .withPagesPerDocument(1) 254 | .build(); 255 | 256 | PagePdfDocumentReader pagePdfDocumentReader = new PagePdfDocumentReader(resource, loadConfig); 257 | // 存储到向量数据库中 258 | vectorStore.accept(tokenTextSplitter.apply(pagePdfDocumentReader.get())); 259 | } 260 | 261 | /** 262 | * 根据PDF的目录(段落)进行划分 263 | * @param url 264 | */ 265 | public void saveSourceByParagraph(String url){ 266 | Resource resource = resourceLoader.getResource(url); 267 | 268 | PdfDocumentReaderConfig loadConfig = PdfDocumentReaderConfig.builder() 269 | .withPageExtractedTextFormatter( 270 | new ExtractedTextFormatter 271 | .Builder() 272 | .withNumberOfBottomTextLinesToDelete(3) 273 | .withNumberOfTopPagesToSkipBeforeDelete(1) 274 | .build() 275 | ) 276 | .withPagesPerDocument(1) 277 | .build(); 278 | 279 | ParagraphPdfDocumentReader pdfReader = new ParagraphPdfDocumentReader( 280 | resource, 281 | loadConfig 282 | ); 283 | vectorStore.accept(tokenTextSplitter.apply(pdfReader.get())); 284 | } 285 | 286 | /** 287 | * MultipartFile对象存储,采用PagePdfDocumentReader 288 | * @param file 289 | */ 290 | public void saveSource(MultipartFile file){ 291 | try { 292 | // 获取文件名 293 | String fileName = file.getOriginalFilename(); 294 | // 获取文件内容类型 295 | String contentType = file.getContentType(); 296 | // 获取文件字节数组 297 | byte[] bytes = file.getBytes(); 298 | // 创建一个临时文件 299 | Path tempFile = Files.createTempFile("temp-", fileName); 300 | // 将文件字节数组保存到临时文件 301 | Files.write(tempFile, bytes); 302 | // 创建一个 FileSystemResource 对象 303 | Resource fileResource = new FileSystemResource(tempFile.toFile()); 304 | PdfDocumentReaderConfig loadConfig = PdfDocumentReaderConfig.builder() 305 | .withPageExtractedTextFormatter( 306 | new ExtractedTextFormatter 307 | .Builder() 308 | .withNumberOfBottomTextLinesToDelete(3) 309 | .withNumberOfTopPagesToSkipBeforeDelete(1) 310 | .build() 311 | ) 312 | .withPagesPerDocument(1) 313 | .build(); 314 | PagePdfDocumentReader pagePdfDocumentReader = new 
PagePdfDocumentReader(fileResource, loadConfig); 315 | vectorStore.accept(tokenTextSplitter.apply(pagePdfDocumentReader.get())); 316 | }catch (IOException e){ 317 | e.printStackTrace(); 318 | } 319 | 320 | } 321 | } 322 | ``` 323 | 324 | ### 6. 构建对话服务 325 | 326 |  创建`ChatService`类,该类提供了两种对话方式:`不进行检索的普通对话模式`和`对向量数据库进行检索的对话模式` 327 | 328 | ```java 329 | package com.ningning0111.vectordatabasedemo.service; 330 | 331 | import lombok.RequiredArgsConstructor; 332 | import org.springframework.ai.chat.ChatClient; 333 | import org.springframework.ai.chat.ChatResponse; 334 | import org.springframework.ai.chat.messages.Message; 335 | import org.springframework.ai.chat.messages.UserMessage; 336 | import org.springframework.ai.chat.prompt.Prompt; 337 | import org.springframework.ai.chat.prompt.SystemPromptTemplate; 338 | import org.springframework.ai.document.Document; 339 | import org.springframework.ai.vectorstore.VectorStore; 340 | import org.springframework.stereotype.Service; 341 | 342 | import java.util.List; 343 | import java.util.Map; 344 | import java.util.stream.Collectors; 345 | 346 | @Service 347 | @RequiredArgsConstructor 348 | public class ChatService { 349 | 350 | // 系统提示词 351 | private final static String SYSTEM_PROMPT = """ 352 | 你需要使用文档内容对用户提出的问题进行回复,同时你需要表现得天生就知道这些内容, 353 | 不能在回复中体现出你是根据给出的文档内容进行回复的,这点非常重要。 354 | 355 | 当用户提出的问题无法根据文档内容进行回复或者你也不清楚时,回复不知道即可。 356 | 357 | 文档内容如下: 358 | {documents} 359 | 360 | """; 361 | 362 | private final ChatClient chatClient; 363 | private final VectorStore vectorStore; 364 | 365 | // 简单的对话,不对向量数据库进行检索 366 | public String simpleChat(String userMessage) { 367 | return chatClient.call(userMessage); 368 | } 369 | 370 | // 通过向量数据库进行检索 371 | public String chatByVectorStore(String message) { 372 | // 根据问题文本进行相似性搜索 373 | List listOfSimilarDocuments = vectorStore.similaritySearch(message); 374 | // 将Document列表中每个元素的content内容进行拼接获得documents 375 | String documents = 
listOfSimilarDocuments.stream().map(Document::getContent).collect(Collectors.joining()); 376 | // 使用Spring AI 提供的模板方式构建SystemMessage对象 377 | Message systemMessage = new SystemPromptTemplate(SYSTEM_PROMPT).createMessage(Map.of("documents", documents)); 378 | // 构建UserMessage对象 379 | UserMessage userMessage = new UserMessage(message); 380 | // 将Message列表一并发送给ChatGPT 381 | ChatResponse rsp = chatClient.call(new Prompt(List.of(systemMessage, userMessage))); 382 | return rsp.getResult().getOutput().getContent(); 383 | } 384 | } 385 | ``` 386 | 387 | ### 7. 构建Controller层 388 | 389 |  `ChatController`提供了对话接口: 390 | 391 | ```java 392 | package com.ningning0111.vectordatabasedemo.controller; 393 | 394 | import com.ningning0111.vectordatabasedemo.service.ChatService; 395 | import lombok.RequiredArgsConstructor; 396 | import org.springframework.web.bind.annotation.GetMapping; 397 | import org.springframework.web.bind.annotation.RequestMapping; 398 | import org.springframework.web.bind.annotation.RequestParam; 399 | import org.springframework.web.bind.annotation.RestController; 400 | 401 | @RestController 402 | @RequiredArgsConstructor 403 | @RequestMapping("/api/v1/chat") 404 | public class ChatController { 405 | 406 | private final ChatService chatService; 407 | 408 | @GetMapping("/simple") 409 | public String simpleChat( 410 | @RequestParam String message 411 | ){ 412 | return chatService.simpleChat(message); 413 | } 414 | 415 | @GetMapping("/") 416 | public String chat( 417 | @RequestParam String message 418 | ){ 419 | return chatService.chatByVectorStore(message); 420 | } 421 | } 422 | ``` 423 | 424 |  `PdfUploadController`提供了上传文件并保存到向量数据库中的接口 425 | 426 | ```java 427 | package com.ningning0111.vectordatabasedemo.controller; 428 | 429 | import com.ningning0111.vectordatabasedemo.service.PdfStoreService; 430 | import lombok.RequiredArgsConstructor; 431 | import org.springframework.stereotype.Controller; 432 | import org.springframework.web.bind.annotation.PostMapping; 433 | 
import org.springframework.web.bind.annotation.RequestMapping; 434 | import org.springframework.web.bind.annotation.RequestParam; 435 | import org.springframework.web.multipart.MultipartFile; 436 | 437 | @Controller 438 | @RequestMapping("/api/v1/pdf") 439 | @RequiredArgsConstructor 440 | public class PdfUploadController { 441 | private final PdfStoreService pdfStoreService; 442 | 443 | @PostMapping("/upload") 444 | public void upload( 445 | @RequestParam MultipartFile file 446 | ){ 447 | pdfStoreService.saveSource(file); 448 | } 449 | } 450 | ``` 451 | 452 | ## 四、效果图 453 | 454 |  以24年合工大软工实训的pdf文件为例,通过向chatgpt提问与文档内容相关的问题。 455 | 456 | ![img.png](images/1.png) 457 | 458 | ![img_1.png](images/2.png) 459 | 460 | ![img_2.png](images/3.png) 461 | --------------------------------------------------------------------------------