├── .gitignore ├── README.md ├── file.pdf ├── main.py ├── requirements.txt └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | files/*.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # chatPDF 2 | 3 | Load a PDF file and ask questions via llama_index and GPT 4 | 5 | ## Instructions 6 | 7 | - Install the requirements 8 | 9 | ```bash 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | - Get a GPT API key from [OpenAI](https://platform.openai.com/account/api-keys) if you don't have one already. 14 | 15 | - Paste your API key in a file called `.env` in the root directory of the project. 16 | 17 | ```bash 18 | OPENAI_API_KEY= 19 | ``` 20 | 21 | - Select a file from the menu or replace the default file `file.pdf` with the PDF you want to use. 22 | 23 | - Run the script. 24 | 25 | ```bash 26 | python3 main.py 27 | ``` 28 | 29 | - Ask any questions about the content of the PDF. 30 | 31 | - You can find other loaders at [Llama Hub](https://llamahub.ai/). 32 | 33 | - Enjoy! 34 | 35 | -------------------------------------------------------------------------------- /file.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gabacode/chatPDF/d2d1e1033d58ffc5c5c4647dc2e63c7ae2dc184c/file.pdf -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig(level=logging.CRITICAL) 4 | 5 | import os 6 | from pathlib import Path 7 | 8 | import openai 9 | from dotenv import load_dotenv 10 | from langchain.chat_models import ChatOpenAI 11 | from llama_index import ( 12 | GPTVectorStoreIndex, 13 | LLMPredictor, 14 | ServiceContext, 15 | StorageContext, 16 | download_loader, 17 | load_index_from_storage, 18 | ) 19 | from utils import CACHE, FILES, models, cls, handle_save, handle_exit, initialize, select_file 20 | 21 | load_dotenv() 22 | openai.api_key = os.environ["OPENAI_API_KEY"] 23 | history = [] 24 | 25 | llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.618, model_name=models["gpt-3"], max_tokens=256)) 26 | service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, chunk_size_limit=1024) 27 | 28 | 29 | def make_index(file): 30 | cls() 31 | print("👀 Loading...") 32 | 33 | PDFReader = download_loader("PDFReader") 34 | loader = PDFReader() 35 | documents = loader.load_data(file=Path(FILES) / file) 36 | 37 | if os.path.exists(Path(CACHE) / file): 38 | print("📚 Index found in cache") 39 | return 40 | else: 41 | print("📚 Index not found in cache, creating it...") 42 | index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context) 43 | index.storage_context.persist(persist_dir=Path(CACHE) / file) 44 | 45 | 46 | def chat(file_name, index): 47 | while True: 48 | prompt = input("\n😎 Prompt: ") 49 | if prompt == "exit": 50 | handle_exit() 51 | elif prompt == "save": 52 | handle_save(str(file_name), history) 53 | 54 | query_engine = index.as_query_engine(response_mode="compact") 55 | 56 | response = query_engine.query(prompt) 57 | print("\n👻 Response: " + str(response)) 58 | history.append({"user": prompt, "response": str(response)}) 59 | 60 | 61 | def ask(file_name): 62 | try: 63 | print("👀 Loading...") 64 | storage_context = StorageContext.from_defaults(persist_dir=Path(CACHE) / file_name) 65 | index = load_index_from_storage(storage_context, service_context=service_context) 66 | cls() 67 | print("✅ Ready! Let's start the conversation") 68 | print("ℹ️ Press Ctrl+C to exit") 69 | chat(file_name, index) 70 | except KeyboardInterrupt: 71 | handle_exit() 72 | 73 | 74 | if __name__ == "__main__": 75 | initialize() 76 | file = select_file() 77 | if file: 78 | file_name = Path(file).name 79 | make_index(file_name) 80 | ask(file_name) 81 | else: 82 | print("No files found") 83 | handle_exit() 84 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.194 2 | llama_index==0.6.15 3 | openai==0.27.2 4 | python-dotenv==1.0.0 5 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import sys 4 | 5 | 6 | FILES = "./files" 7 | CACHE = f"{FILES}/.cache" 8 | 9 | models = {"davinci": "text-davinci-003", "gpt-3": "gpt-3.5-turbo"} 10 | 11 | 12 | def initialize(): 13 | if not os.path.exists(FILES): 14 | os.mkdir(FILES) 15 | if not os.path.exists(CACHE): 16 | os.mkdir(CACHE) 17 | 18 | def cls(): 19 | os.system('cls' if os.name=='nt' else 'clear') 20 | 21 | 22 | def select_file(): 23 | cls() 24 | files = [file for file in os.listdir(FILES) if file.endswith(".pdf")] 25 | if len(files) == 0: 26 | return "file.pdf" if os.path.exists("file.pdf") else None 27 | print("📁 Select a file") 28 | for i, file in enumerate(files): 29 | print(f"{i+1}. {file}") 30 | print() 31 | 32 | try: 33 | possible_selections = [i for i in range(len(files) + 1)] 34 | selection = int(input("Enter a number, or 0 to exit: ")) 35 | if selection == 0: 36 | handle_exit() 37 | elif selection not in possible_selections: 38 | select_file() 39 | else: 40 | file_path = os.path.abspath(os.path.join(FILES, files[selection - 1])) 41 | except ValueError: 42 | select_file() 43 | 44 | return file_path 45 | 46 | 47 | def handle_exit(): 48 | print("\nGoodbye!\n") 49 | sys.exit(1) 50 | 51 | def handle_save(title, history): 52 | with open(f"{title}.json", "w") as f: 53 | json.dump(history, f) --------------------------------------------------------------------------------