├── .env.example ├── .gitignore ├── README.md ├── __init__.py ├── app.py ├── file_utils.py ├── flask_demo.py ├── index_server.py ├── pyproject.toml ├── requirements.txt ├── startup.sh └── upload_s3.py /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=XXX 2 | AWS_ACCESS_KEY_ID=XXX 3 | AWS_SECRET_ACCESS_KEY=XXX 4 | AWS_REGION=us-east-2 5 | PINECONE_API_KEY=XXX 6 | PINECONE_REGION=us-central1-gcp 7 | MONGO_DB_URL=mongodb+srv://USER:PASSWORD@cluster0.CLUSTER_ID.mongodb.net/ 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | env/ 2 | __pycache__/ 3 | documents/ 4 | preview_images/ 5 | .env 6 | .idea 7 | .DS_Store 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # slidespeak-backend 2 | 3 | ![slidespeak-banner-github](https://github.com/SlideSpeak/slidespeak-backend/assets/5519740/6dba254f-abdd-40fd-a647-59ec2b41e0fb) 4 | 5 | [SlideSpeak](https://slidespeak.co): The ultimate AI presentation maker. Summarize PowerPoint files with AI or create entire PowerPoint presentations. Upload your PowerPoint files and use SlideSpeak to get the information you need. 6 | 7 | SlideSpeak was built with: 8 | 9 | - [Llama Index](https://github.com/jerryjliu/llama_index) and uses the OpenAI [GPT 3.5 Turbo](https://platform.openai.com/docs/models/gpt-3-5) Mobel 10 | - [PineCone](https://www.pinecone.io/) as the primary vector storage 11 | - [MongoDB](https://mongodb.com/) as the Index Store and Document Store 12 | - AWS S3 as the blob file storage 13 | 14 | The frontend for this project is available here: [https://github.com/SlideSpeak/slidespeak-webapp](https://github.com/SlideSpeak/slidespeak-webapp) 15 | 16 | ## Requirements 17 | 18 | - Python3 19 | - Pinecone 20 | - MongoDB 21 | - S3 with AWS credentials 22 | - OpenAI API credentials 23 | 24 | ## Installation 25 | 26 | - Create a virtual env: `python3 -m venv env` 27 | - Activate the virtual env: `source env/bin/activate` 28 | - Install all dependencies `python3 -m pip install -r requirements.txt` 29 | - Enable python cerificate: `sudo /Applications/Python[VERSION]/Certificates.command` 30 | - Install unoconv: `brew install unoconv` 31 | - Install libreoffice via `https://libreoffice.org/download/` 32 | - Create `.env` and set all environment variables (see `.env.example`) 33 | 34 | ## Setup 35 | 36 | _Please note:_ Both the index server and the flask backend need to run in parallel. 37 | 38 | - Start index server `python3 index_server.py` 39 | - Start Flask Backend `python3 flask_demo.py` 40 | 41 | ## License 42 | 43 | See LICENSE file. 44 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SlideSpeak/slidespeak-backend/2dc0f533785735de6e800ed0cc2a7d6cf5f0e059/__init__.py -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def main(): 5 | server = subprocess.Popen(["python3", "index_server.py"]) 6 | flask_demo = subprocess.Popen(["python3", "flask_demo.py"]) 7 | 8 | try: 9 | # Wait for both subprocesses to finish 10 | server.communicate() 11 | flask_demo.communicate() 12 | except KeyboardInterrupt: 13 | # If the user presses CTRL+C, terminate both subprocesses 14 | server.terminate() 15 | flask_demo.terminate() 16 | 17 | 18 | if __name__ == "__main__": 19 | main() -------------------------------------------------------------------------------- /file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | import subprocess 4 | from pdf2image import convert_from_path 5 | 6 | 7 | def search_and_extract(zip_filepath, target_files, extract_to): 8 | # Ensure the target directory exists 9 | if not os.path.exists(extract_to): 10 | os.makedirs(extract_to) 11 | 12 | extracted_files = [] 13 | 14 | # Open the zip file in read mode 15 | with zipfile.ZipFile(zip_filepath, "r") as zip_ref: 16 | # Loop over the files in the zip file 17 | for filename in zip_ref.namelist(): 18 | # Check the filename part after the last slash 19 | if os.path.basename(filename) in target_files: 20 | # Extract the file 21 | zip_ref.extract(filename, extract_to) 22 | print(f"File {filename} extracted to {extract_to}") 23 | extracted_files.append(extract_to + "/" + os.path.basename(filename)) 24 | return extracted_files 25 | 26 | 27 | def ppt_preview(ppt_file_path, preview_file_path): 28 | # Check the file extension 29 | if not ppt_file_path.endswith((".ppt", ".pptx")): 30 | raise ValueError("File must be a .ppt or .pptx file") 31 | 32 | # Generate a temporary pdf path 33 | pdf_file_path = os.path.splitext(ppt_file_path)[0] + ".pdf" 34 | print(pdf_file_path) 35 | 36 | # Convert PowerPoint to PDF using unoconv 37 | subprocess.run(["unoconv", "-f", "pdf", "-o", pdf_file_path, ppt_file_path]) 38 | 39 | # Convert PDF to list of images 40 | images = convert_from_path(pdf_file_path) 41 | 42 | preview_file_paths = [] 43 | for i, image in enumerate(images): 44 | fname = os.path.splitext(preview_file_path)[0] + f"-{i}.jpg" 45 | image.save(fname, "JPEG") 46 | preview_file_paths.append(fname) 47 | # Save the first image (the first slide of the ppt) to the preview_file_path 48 | # images[0].save(preview_file_path, "JPEG") 49 | 50 | return preview_file_paths 51 | -------------------------------------------------------------------------------- /flask_demo.py: -------------------------------------------------------------------------------- 1 | import time 2 | from concurrent.futures import ThreadPoolExecutor 3 | import json 4 | import os 5 | from multiprocessing.managers import BaseManager 6 | from flask import Flask, request, jsonify, make_response, Response 7 | from flask_cors import CORS 8 | import uuid 9 | from concurrent.futures import as_completed 10 | from file_utils import ppt_preview 11 | from upload_s3 import upload_file_to_s3 12 | from werkzeug.utils import secure_filename 13 | 14 | app = Flask(__name__) 15 | app.response_buffering = False 16 | CORS(app) 17 | 18 | # initialize manager connection 19 | # TODO: you might want to handle the password in a less hardcoded way 20 | manager = BaseManager(("", 5602), b"password") 21 | manager.register("query_index") 22 | manager.register("insert_into_index") 23 | manager.register("get_documents_list") 24 | manager.register("create_index") 25 | manager.register("start_worker") 26 | manager.register("initialize_index") 27 | 28 | # Try to connect 29 | for _ in range(10): 30 | try: 31 | manager.connect() 32 | break 33 | except ConnectionRefusedError: 34 | print("Connecting to index server has failed, waiting before retrying...") 35 | time.sleep(3) 36 | 37 | executor = ThreadPoolExecutor() 38 | 39 | 40 | @app.route("/stream") 41 | def stream(): 42 | query_text = request.args.get("text", None) 43 | request.args.get("doc_id", None) 44 | uuid_id = request.args.get("uuid", None) 45 | if query_text is None: 46 | return "No text found, please include a ?text=blah parameter in the URL", 400 47 | 48 | if uuid_id is None: 49 | return "No text found, please include a ?text=blah parameter in the URL", 400 50 | manager.initialize_index(uuid_id) 51 | queue = manager.start_worker(query_text, uuid_id) 52 | 53 | def generate(): 54 | while True: 55 | response = ( 56 | queue.get() 57 | ) # This will block until there is data available in the queue 58 | if response is None: # If we get None, that means the stream is done 59 | break 60 | yield str(response) 61 | 62 | return Response(generate(), mimetype="text/event-stream") 63 | 64 | 65 | # TODO: Can we delete this route? 66 | @app.route("/query", methods=["GET"]) 67 | def query_index(): 68 | global manager 69 | query_text = request.args.get("text", None) 70 | query_doc_id = request.args.get("doc_id", None) 71 | uuid_id = request.args.get("uuid", None) 72 | if query_text is None: 73 | return "No text found, please include a ?text=blah parameter in the URL", 400 74 | if uuid_id is None: 75 | return "No UUID found, please include a uuid in the URL", 400 76 | 77 | manager.initialize_index(uuid_id) 78 | response = manager.query_index(query_text, query_doc_id)._getvalue() 79 | response_json = { 80 | "text": str(response), 81 | } 82 | return make_response(jsonify(response_json)), 200 83 | 84 | 85 | @app.route("/uploadFile", methods=["POST"]) 86 | def upload_file(): 87 | global manager, executor 88 | if "file" not in request.files: 89 | return "Please send a POST request with a file", 400 90 | 91 | filepath = None 92 | try: 93 | generated_uuid = str(uuid.uuid4()) 94 | uploaded_file = request.files["file"] 95 | filename = secure_filename(str(uuid.uuid4()) + '.pptx') 96 | filepath = os.path.join("documents", os.path.basename(filename)) 97 | 98 | start_time = time.time() 99 | uploaded_file.save(filepath) 100 | print('Saving the local PPT file: {:.2f}s'.format(time.time() - start_time)) 101 | 102 | start_time = time.time() 103 | if request.form.get("filename_as_doc_id", None) is not None: 104 | manager.insert_into_index(filepath, doc_id=filename) 105 | else: 106 | manager.insert_into_index(filepath, generated_uuid) 107 | print('Inserting into llama index: {:.2f}s'.format(time.time() - start_time)) 108 | except Exception as e: 109 | print(e) 110 | # cleanup temp file 111 | if filepath is not None and os.path.exists(filepath): 112 | os.remove(filepath) 113 | return "Error: {}".format(str(e)), 500 114 | 115 | # upload file to s3 116 | start_time = time.time() 117 | upload_done = executor.submit( 118 | upload_file_to_s3, 119 | filepath, 120 | "slidespeak-files", 121 | generated_uuid + os.path.splitext(filepath)[1], 122 | ) 123 | print('Upload PPT to S3: {:.2f}s'.format(time.time() - start_time)) 124 | 125 | # delete file after upload 126 | upload_done.add_done_callback( 127 | lambda _: os.remove(filepath) if os.path.exists(filepath) else None 128 | ) 129 | 130 | start_time = time.time() 131 | preview_file_paths = ppt_preview( 132 | filepath, "preview_images/" + generated_uuid + ".jpg" 133 | ) 134 | print('Generating PPT preview: {:.2f}s'.format(time.time() - start_time)) 135 | 136 | preview_urls_dict = {} 137 | 138 | if len(preview_file_paths) > 0: 139 | # Make a list of all futures for the uploads 140 | future_to_preview = { 141 | executor.submit( 142 | upload_file_to_s3, 143 | preview_file_path, 144 | "slidespeak-files", 145 | "preview-images/" + os.path.basename(preview_file_path) 146 | ): preview_file_path for preview_file_path in preview_file_paths 147 | } 148 | 149 | start_time = time.time() 150 | for future in as_completed(future_to_preview): 151 | preview_file_path = future_to_preview[future] 152 | try: 153 | preview_url = future.result() 154 | index = preview_file_paths.index(preview_file_path) 155 | preview_urls_dict[index] = preview_url 156 | 157 | if os.path.exists(preview_file_path): 158 | os.remove(preview_file_path) 159 | except Exception as exc: 160 | print(f'{preview_file_path} generated an exception: {exc}') 161 | print('Uploading preview images to S3: {:.2f}s'.format(time.time() - start_time)) 162 | 163 | # Convert dict to list in correct order 164 | preview_urls = [preview_urls_dict[i] for i in sorted(preview_urls_dict.keys())] 165 | 166 | return ( 167 | make_response(jsonify({"uuid": generated_uuid, "previewUrls": preview_urls})), 168 | 200, 169 | ) 170 | 171 | 172 | @app.route("/getDocuments", methods=["GET"]) 173 | def get_documents(): 174 | document_list = manager.get_documents_list()._getvalue() 175 | 176 | return make_response(jsonify(document_list)), 200 177 | 178 | 179 | @app.route("/") 180 | def home(): 181 | return "Hello, World! Welcome to the llama_index docker image!" 182 | 183 | 184 | if __name__ == "__main__": 185 | app.run(host="0.0.0.0", port=5601) 186 | -------------------------------------------------------------------------------- /index_server.py: -------------------------------------------------------------------------------- 1 | from queue import Queue 2 | import os 3 | from threading import Thread 4 | from dotenv import load_dotenv 5 | from llama_index.callbacks import LlamaDebugHandler, CallbackManager 6 | 7 | load_dotenv() 8 | 9 | import boto3 10 | import pinecone 11 | from multiprocessing.managers import BaseManager 12 | 13 | from llama_index.storage.docstore import MongoDocumentStore 14 | from llama_index.node_parser import SimpleNodeParser 15 | from llama_index.vector_stores.pinecone import PineconeVectorStore 16 | from llama_index.storage.index_store import MongoIndexStore 17 | from llama_index.llm_predictor.chatgpt import LLMPredictor 18 | from langchain.chat_models import ChatOpenAI 19 | from llama_index import ( 20 | VectorStoreIndex, 21 | ServiceContext, 22 | StorageContext, 23 | ) 24 | from llama_index import download_loader 25 | 26 | boto3.set_stream_logger("botocore", level="DEBUG") 27 | 28 | AWS_KEY = os.environ["AWS_ACCESS_KEY_ID"] 29 | AWS_SECRET = os.environ["AWS_SECRET_ACCESS_KEY"] 30 | MONGO_DB_URL = os.environ["MONGO_DB_URL"] 31 | PINECONE_API_KEY = os.environ["PINECONE_API_KEY"] 32 | PINECONE_REGION = os.environ["PINECONE_REGION"] 33 | 34 | pinecone.init( 35 | api_key=PINECONE_API_KEY, 36 | environment=PINECONE_REGION, 37 | ) 38 | 39 | import openai 40 | 41 | openai.api_key = os.environ["OPENAI_API_KEY"] 42 | 43 | index = None 44 | stored_docs = {} 45 | docstore = MongoDocumentStore.from_uri(uri=MONGO_DB_URL) 46 | index_store = MongoIndexStore.from_uri(uri=MONGO_DB_URL) 47 | llama_debug = LlamaDebugHandler(print_trace_on_end=True) 48 | callback_manager = CallbackManager([llama_debug]) 49 | 50 | PptxReader = download_loader("PptxReader") 51 | loader = PptxReader() 52 | 53 | 54 | def initialize_index(namespace): 55 | print("start to initialize index") 56 | """Create a new global index, or load one from the pre-set path.""" 57 | global index, stored_docs, docstore, index_store 58 | 59 | llm_predictor = LLMPredictor( 60 | llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", streaming=True) 61 | ) 62 | service_context = ServiceContext.from_defaults( 63 | chunk_size_limit=512, llm_predictor=llm_predictor,callback_manager=callback_manager 64 | ) 65 | print(namespace) 66 | # TODO: Move this to an env variable 67 | pinecone_index = pinecone.Index("pptx-index") 68 | vector_store = PineconeVectorStore( 69 | pinecone_index=pinecone_index, 70 | namespace=namespace, 71 | ) 72 | storage_context = StorageContext.from_defaults( 73 | docstore=docstore, index_store=index_store, vector_store=vector_store 74 | ) 75 | index = VectorStoreIndex.from_documents( 76 | [], storage_context=storage_context, service_context=service_context 77 | ) 78 | print("index initialized") 79 | 80 | 81 | def worker(queue, query_text, name): 82 | """Query the global index.""" 83 | initialize_index(name) 84 | global index 85 | streaming_response = index.as_query_engine( 86 | streaming=True, similarity_top_k=1 87 | ).query(query_text) 88 | for text in streaming_response.response_gen: 89 | # do something with text as they arrive. 90 | print(text) 91 | queue.put(text) # Put the text into the queue 92 | queue.put(None) # Signal the end of the stream 93 | 94 | 95 | def start_worker(query_text, name): 96 | print("start_worker") 97 | print(name) 98 | queue = Queue() 99 | t = Thread(target=worker, args=(queue, query_text, name)) 100 | t.start() 101 | return queue 102 | 103 | 104 | def query_index(query_text, name): 105 | """Query the global index.""" 106 | print("querying index...") 107 | global index 108 | response = index.as_query_engine().query(query_text) 109 | return response 110 | 111 | 112 | def insert_into_index(doc_file_path, doc_id=None): 113 | """Insert new document into global index.""" 114 | global index, stored_docs, docstore 115 | initialize_index(doc_id) 116 | document = loader.load_data(file=doc_file_path)[0] 117 | 118 | # create parser and parse document into nodes 119 | parser = SimpleNodeParser() 120 | nodes = parser.get_nodes_from_documents([document]) 121 | docstore.add_documents(nodes) 122 | 123 | if doc_id is not None: 124 | document.doc_id = doc_id 125 | 126 | index.insert(document) 127 | # TODO: Check if this limits the entire document that is parsed to 200 chars 128 | stored_docs[document.doc_id] = document.text[0:200] # only take the first 200 chars 129 | return 130 | 131 | 132 | def get_documents_list(): 133 | """Get the list of currently stored documents.""" 134 | global stored_docs 135 | documents_list = [] 136 | for doc_id, doc_text in stored_docs.items(): 137 | documents_list.append({"id": doc_id, "text": doc_text}) 138 | return documents_list 139 | 140 | 141 | if __name__ == "__main__": 142 | # init the global index 143 | print("initializing index...") 144 | # initialize_index() 145 | 146 | # setup server 147 | # NOTE: you might want to handle the password in a less hardcoded way 148 | manager = BaseManager(("", 5602), b"password") 149 | manager.register("query_index", query_index) 150 | manager.register("insert_into_index", insert_into_index) 151 | manager.register("get_documents_list", get_documents_list) 152 | manager.register("get_queue") 153 | manager.register("initialize_index") 154 | manager.register("start_worker", start_worker) 155 | 156 | server = manager.get_server() 157 | 158 | print("server started...") 159 | server.serve_forever() 160 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 3 | # TODO: Enable strict linting with ["ALL"] 4 | select = ["E", "F"] 5 | # Disabled because dotenv needs to be loaded ahead of llama index 6 | ignore = ["E402"] 7 | 8 | # Allow autofix for all enabled rules (when `--fix`) is provided. 9 | fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] 10 | unfixable = [] 11 | 12 | # Exclude a variety of commonly ignored directories. 13 | exclude = [ 14 | ".bzr", 15 | ".direnv", 16 | ".eggs", 17 | ".git", 18 | ".git-rewrite", 19 | ".hg", 20 | ".mypy_cache", 21 | ".nox", 22 | ".pants.d", 23 | ".pytype", 24 | ".ruff_cache", 25 | ".svn", 26 | ".tox", 27 | ".venv", 28 | "__pypackages__", 29 | "_build", 30 | "buck-out", 31 | "build", 32 | "dist", 33 | "node_modules", 34 | "venv", 35 | ] 36 | per-file-ignores = {} 37 | 38 | # Same as Black. 39 | line-length = 88 40 | 41 | # Allow unused variables when underscore-prefixed. 42 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 43 | 44 | # Assume Python 3.10. 45 | target-version = "py310" 46 | 47 | [tool.ruff.mccabe] 48 | # Unlike Flake8, default to a complexity level of 10. 49 | max-complexity = 10 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | async-timeout==4.0.2 4 | attrs==23.1.0 5 | blinker==1.6.2 6 | boto3==1.26.155 7 | botocore==1.29.155 8 | certifi==2023.5.7 9 | charset-normalizer==3.1.0 10 | click==8.1.3 11 | dataclasses-json==0.5.8 12 | dnspython==2.3.0 13 | filelock==3.12.2 14 | Flask==2.3.2 15 | Flask-Cors==3.0.10 16 | frozenlist==1.3.3 17 | fsspec==2023.6.0 18 | huggingface-hub==0.15.1 19 | idna==3.4 20 | itsdangerous==2.1.2 21 | Jinja2==3.1.2 22 | jmespath==1.0.1 23 | langchain==0.0.202 24 | langchainplus-sdk==0.0.10 25 | llama-index==0.6.26 26 | loguru==0.7.0 27 | lxml==4.9.2 28 | MarkupSafe==2.1.3 29 | marshmallow==3.19.0 30 | marshmallow-enum==1.5.1 31 | mpmath==1.3.0 32 | multidict==6.0.4 33 | mypy-extensions==1.0.0 34 | networkx==3.1 35 | numexpr==2.8.4 36 | numpy==1.24.3 37 | openai==0.27.8 38 | openapi-schema-pydantic==1.2.4 39 | packaging==23.1 40 | pandas==2.0.2 41 | pdf2image==1.16.3 42 | Pillow==9.5.0 43 | pinecone-client==2.2.2 44 | pydantic==1.10.9 45 | pymongo==4.3.3 46 | python-dateutil==2.8.2 47 | python-dotenv==1.0.0 48 | python-pptx==0.6.21 49 | pytz==2023.3 50 | PyYAML==6.0 51 | regex==2023.6.3 52 | requests==2.31.0 53 | ruff==0.0.275 54 | s3transfer==0.6.1 55 | safetensors==0.3.1 56 | six==1.16.0 57 | SQLAlchemy==2.0.16 58 | sympy==1.12 59 | tenacity==8.2.2 60 | tiktoken==0.4.0 61 | tokenizers==0.13.3 62 | torch==2.0.1 63 | torchvision==0.15.2 64 | tqdm==4.65.0 65 | transformers==4.30.2 66 | typing-inspect==0.8.0 67 | typing_extensions==4.5.0 68 | tzdata==2023.3 69 | urllib3==1.26.16 70 | Werkzeug==2.3.6 71 | XlsxWriter==3.1.2 72 | yarl==1.9.2 73 | -------------------------------------------------------------------------------- /startup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python3 -m venv env 4 | source env/bin/activate 5 | python3 -m pip install -r /servers/slidespeak-backend/requirements.txt 6 | 7 | python3 /servers/slidespeak-backend/app.py -------------------------------------------------------------------------------- /upload_s3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | 4 | s3 = boto3.client("s3") 5 | 6 | 7 | def upload_file_to_s3(file_path, bucket_name, object_name=None): 8 | # Create an S3 client 9 | s3 = boto3.client("s3") 10 | 11 | # Specify the S3 bucket and object name 12 | if object_name is None: 13 | object_name = os.path.basename(file_path) 14 | 15 | # Upload the file to S3 16 | try: 17 | s3.upload_file(file_path, bucket_name, object_name) 18 | 19 | # Construct the URL 20 | file_url = f"https://{bucket_name}.s3.amazonaws.com/{object_name}" 21 | print("File uploaded successfully.") 22 | print(file_url) 23 | return file_url 24 | except Exception as e: 25 | print("Error uploading file:", str(e)) 26 | 27 | 28 | def delete_file_by_path(filepath): 29 | if filepath is not None and os.path.exists(filepath): 30 | os.remove(filepath) 31 | --------------------------------------------------------------------------------