├── deepcrack.pdf
├── pdf_database.db
├── README.md
├── LICENSE
├── main.py
├── sqlite_vector.py
├── postgres_vector.py
└── .gitignore

/deepcrack.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bipark/Ollama-Gemma2-PDF-RAG/HEAD/deepcrack.pdf
--------------------------------------------------------------------------------
/pdf_database.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bipark/Ollama-Gemma2-PDF-RAG/HEAD/pdf_database.db
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PDF RAG Search and Summarization Based on Ollama - Gemma2

This project contains Python scripts that split a PDF file into chunks and store them in a SQLite database. The goal is to search and summarize the PDF data with a RAG (Retrieval-Augmented Generation) pipeline.

The LLM is Ollama-Gemma2, a local model.

## Key Features

- **PDF loading**: loads the PDF file with `PyPDFLoader`.
- **Text splitting**: splits the loaded document into chunks with `RecursiveCharacterTextSplitter`; chunk size and overlap are configurable.
- **Database storage**: stores the split chunks in a SQLite database, each chunk with a unique ID.
- **Search**: performs retrieval with the RAG pipeline; results are returned together with their chunk IDs.
- **Summarization**: summarizes the retrieved results and returns the summary to the user.

## Usage

Install the dependencies first (the package set below is inferred from the imports), then run:

```bash
pip install langchain langchain-community pypdf sentence-transformers faiss-cpu
python main.py
```

## Example Output

An example that summarizes the attached PDF file. The query '문서를 한글로 요약해주세요.' asks the model to summarize the document in Korean; the Korean result is shown here in English translation:

{'query': '문서를 한글로 요약해주세요.', 'result': 'This paper proposes a method that improves accuracy by applying a tolerance buffer to the data. If the difference between the grid-shaped ground-truth data and the prediction falls within a fixed range, it is counted as a True Positive; otherwise it is counted as a False Positive or a False Negative. The figure visualizes the Ground Truth and the Prediction and shows how TP, FP, and FN are computed with the buffer applied.\n\nThe authors took part in each stage (experiment design, model implementation, and preprocessing) to develop a smart composite solution.'}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Billy park

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import os
import sqlite3
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA


def make_data():
    # Connect to the SQLite database
    conn = sqlite3.connect('pdf_database.db')
    cursor = conn.cursor()

    # Create the table if it does not exist
    cursor.execute('''CREATE TABLE IF NOT EXISTS pdf_chunks
                      (id INTEGER PRIMARY KEY, content TEXT)''')

    # Load the PDF file and split it into chunks
    loader = PyPDFLoader("deepcrack.pdf")
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)

    # Store the chunks in the database
    for chunk in chunks:
        cursor.execute("INSERT INTO pdf_chunks (content) VALUES (?)", (chunk.page_content,))
    conn.commit()

    # The cursor holds a reference to its connection, so the connection
    # stays open for the reads below.
    return cursor

if not os.path.exists('pdf_database.db'):
    cursor = make_data()
else:
    conn = sqlite3.connect('pdf_database.db')
    cursor = conn.cursor()

# Create the embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Fetch the chunks from the database and build a FAISS index
cursor.execute("SELECT content FROM pdf_chunks")
db_chunks = cursor.fetchall()
texts = [chunk[0] for chunk in db_chunks]

# Create the FAISS vector store
vectorstore = FAISS.from_texts(texts, embeddings)

# Set up retrieval
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# Create the Ollama model
ollama_gemma2 = Ollama(model="gemma2")

# Create the QA chain
qa = RetrievalQA.from_chain_type(llm=ollama_gemma2, chain_type="stuff", retriever=retriever)

# Generate a response to the user query
query = "문서를 한글로 요약해주세요."  # "Please summarize the document in Korean."
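# Optional sketch, not part of the original script: RetrievalQA.from_chain_type
# accepts chain_type_kwargs, so the "stuff" chain's default prompt can be
# replaced. The template wording below is an illustrative assumption;
# uncomment to try it before invoking the chain.
# from langchain.prompts import PromptTemplate
# prompt = PromptTemplate(
#     template="Answer using only the context below.\n\n{context}\n\nQuestion: {question}",
#     input_variables=["context", "question"],
# )
# qa = RetrievalQA.from_chain_type(
#     llm=ollama_gemma2,
#     chain_type="stuff",
#     retriever=retriever,
#     chain_type_kwargs={"prompt": prompt},
# )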
response = qa.invoke(query)

print(response)
--------------------------------------------------------------------------------
/sqlite_vector.py:
--------------------------------------------------------------------------------
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import SQLiteVSS


##--------------------------------------------------------------##
def load_pdf_process(file_path):

    # Load the PDF file
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    return texts

##--------------------------------------------------------------##
def save_to_db(texts, embeddings):

    docs = [t.page_content for t in texts]

    vector_store = SQLiteVSS.from_texts(
        texts=docs,
        embedding=embeddings,
        table="documents"
    )
    return vector_store

##--------------------------------------------------------------##
def load_from_db(embeddings):
    conn = SQLiteVSS.create_connection(db_file="vss.db")
    db = SQLiteVSS(table="documents", embedding=embeddings, connection=conn)

    return db

##--------------------------------------------------------------##
def query_to_db(query, vector_store, llm, chain_type):

    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 2})
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type=chain_type, retriever=retriever)
    response = qa.invoke(query)

    return response

##--------------------------------------------------------------##
if __name__ == "__main__":
    # Load the PDF file and split the text (uncomment on a first run,
    # together with save_to_db below, to populate vss.db)
    # texts = load_pdf_process("deepcrack.pdf")

    # Create the embeddings
    # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create the vector store
    # vector_store = save_to_db(texts, embeddings)
    vector_store = load_from_db(embeddings)

    # Create the Ollama model
    ollama_gemma2 = Ollama(model="gemma2")

    # Generate a response to the user query
    query = "문서를 한글로 요약해주세요."  # "Please summarize the document in Korean."
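    # Optional sketch, not in the original flow: peek at the raw
    # nearest-neighbour chunks before running the QA chain; k=2 is an
    # illustrative choice matching the retriever settings above.
    # hits = vector_store.similarity_search(query, k=2)
    # for doc in hits:
    #     print(doc.page_content[:200])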
    response = query_to_db(query, vector_store, ollama_gemma2, "stuff")

    print(response)
--------------------------------------------------------------------------------
/postgres_vector.py:
--------------------------------------------------------------------------------
from langchain_community.vectorstores.pgvector import PGVector
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA

# Placeholder credentials: characters such as '@' and '#' in a real password
# must be URL-encoded (e.g. with urllib.parse.quote_plus), or SQLAlchemy will
# misparse the connection string.
CONNECTION_STRING = "postgresql+psycopg2://postgres:!@#$%^@localhost:5432/postgres"

##--------------------------------------------------------------##
def load_pdf_process(file_path):

    # Load the PDF file
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    return texts

##--------------------------------------------------------------##
def save_to_db(texts, embeddings):

    db = PGVector.from_documents(
        documents=texts,
        embedding=embeddings,
        connection_string=CONNECTION_STRING,
        collection_name="pgvector"
    )
    return db

##--------------------------------------------------------------##
def load_from_db(embeddings):

    db = PGVector(
        embedding_function=embeddings,
        collection_name="pgvector",
        connection_string=CONNECTION_STRING,
        use_jsonb=True,
    )

    return db

##--------------------------------------------------------------##
def query_to_db(query, vector_store, llm, chain_type):

    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 2})
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type=chain_type, retriever=retriever)
    response = qa.invoke(query)

    return response

##--------------------------------------------------------------##
if __name__ == "__main__":
    # Load the PDF file and split the text
    texts = load_pdf_process("deepcrack.pdf")

    # Create the embeddings
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create the vector store (uncomment save_to_db on a first run to ingest)
    # vector_store = save_to_db(texts, embeddings)
    vector_store = load_from_db(embeddings)

    # Create the Ollama model
    ollama_gemma2 = Ollama(model="gemma2")

    # Generate a response to the user query
    query = "문서를 한글로 요약해주세요."  # "Please summarize the document in Korean."
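    # Optional sketch, not in the original flow: PGVector also exposes
    # similarity_search_with_score, which returns (document, distance)
    # pairs; a lower distance means a closer match.
    # for doc, score in vector_store.similarity_search_with_score(query, k=2):
    #     print(score, doc.page_content[:120])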
    response = query_to_db(query, vector_store, ollama_gemma2, "stuff")

    print(response)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
pdf/.DS_Store
.DS_Store
--------------------------------------------------------------------------------