├── .gitignore ├── README.md ├── app.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | poetry-convert.py 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # langchain-cohere-qdrant-retrieval 2 | This is a template retrieval repo for creating a Flask API server with LangChain that takes a PDF file and lets you search it in 100+ languages using Cohere embeddings and the Qdrant vector database. 3 | 4 | ## Installation 5 | 6 | Install all the Python dependencies using pip: 7 | 8 | ```bash 9 | pip install -r requirements.txt 10 | ``` 11 | 12 | ## Qdrant setup 13 | 14 | Please make an account on [Qdrant](https://qdrant.tech/) and create a new cluster. You will then be able to get the qdrant_url and qdrant_api_key used in the section below. 15 | 16 | ## Environment variables 17 | 18 | Please assign the environment variables as follows (for example in a `.env` file, which the app loads on startup). 19 | ``` 20 | cohere_api_key="insert here" 21 | openai_api_key="insert here" 22 | qdrant_url="insert here" 23 | qdrant_api_key="insert here" 24 | ``` 25 | 26 | ## Run the app 27 | 28 | Run the app using the Gunicorn command: 29 | 30 | ```bash 31 | gunicorn app:app 32 | ``` 33 | 34 | The app should now be running with two API routes: ```/embed``` and ```/retrieve```. 
35 | 36 | Feel free to reach out if any questions on [Twitter](https://twitter.com/MisbahSy) 37 | 38 | 39 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request 2 | from flask_cors import CORS 3 | import json 4 | 5 | # Loading environment variables 6 | import os 7 | from dotenv import load_dotenv 8 | load_dotenv() 9 | openai_api_key = os.environ.get('openai_api_key') 10 | cohere_api_key = os.environ.get('cohere_api_key') 11 | qdrant_url = os.environ.get('qdrant_url') 12 | qdrant_api_key = os.environ.get('qdrant_api_key') 13 | 14 | #Flask config 15 | app = Flask(__name__) 16 | CORS(app) 17 | 18 | # Test default route 19 | @app.route('/') 20 | def hello_world(): 21 | return {"Hello":"World"} 22 | 23 | ## Embedding code 24 | from langchain.embeddings import CohereEmbeddings 25 | from langchain.document_loaders import PyPDFLoader 26 | from langchain.vectorstores import Qdrant 27 | 28 | @app.route('/embed', methods=['POST']) 29 | def embed_pdf(): 30 | collection_name = request.json.get("collection_name") 31 | file_url = request.json.get("file_url") 32 | 33 | loader = PyPDFLoader(file_url) 34 | docs = loader.load_and_split() 35 | embeddings = CohereEmbeddings(model="multilingual-22-12", cohere_api_key=cohere_api_key) 36 | qdrant = Qdrant.from_documents(docs, embeddings, url=qdrant_url, collection_name=collection_name, prefer_grpc=True, api_key=qdrant_api_key) 37 | 38 | return {"collection_name":qdrant.collection_name} 39 | 40 | # Retrieve information from a collection 41 | from langchain.chains.question_answering import load_qa_chain 42 | from langchain.llms import OpenAI 43 | from qdrant_client import QdrantClient 44 | 45 | @app.route('/retrieve', methods=['POST']) 46 | def retrieve_info(): 47 | collection_name = request.json.get("collection_name") 48 | query = request.json.get("query") 49 | 50 | client = 
QdrantClient(url=qdrant_url, prefer_grpc=True, api_key=qdrant_api_key) 51 | 52 | embeddings = CohereEmbeddings(model="multilingual-22-12", cohere_api_key=cohere_api_key) 53 | qdrant = Qdrant(client=client, collection_name=collection_name, embedding_function=embeddings.embed_query) 54 | search_results = qdrant.similarity_search(query, k=2) 55 | chain = load_qa_chain(OpenAI(openai_api_key=openai_api_key,temperature=0.2), chain_type="stuff") 56 | results = chain({"input_documents": search_results, "question": query}, return_only_outputs=True) 57 | 58 | return {"results":results["output_text"]} -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | anyio==3.6.2 4 | async-timeout==4.0.2 5 | attrs==23.1.0 6 | backoff==2.2.1 7 | blinker==1.6.2 8 | certifi==2022.12.7 9 | charset-normalizer==3.1.0 10 | click==8.1.3 11 | cohere==4.2.1 12 | dataclasses-json==0.5.7 13 | Flask==2.3.1 14 | Flask-Cors==3.0.10 15 | frozenlist==1.3.3 16 | greenlet==2.0.2 17 | grpcio==1.54.0 18 | grpcio-tools==1.54.0 19 | gunicorn==20.1.0 20 | h11==0.14.0 21 | h2==4.1.0 22 | hpack==4.0.0 23 | httpcore==0.17.0 24 | httpx==0.24.0 25 | hyperframe==6.0.1 26 | idna==3.4 27 | itsdangerous==2.1.2 28 | Jinja2==3.1.2 29 | langchain==0.0.149 30 | MarkupSafe==2.1.2 31 | marshmallow==3.19.0 32 | marshmallow-enum==1.5.1 33 | multidict==6.0.4 34 | mypy-extensions==1.0.0 35 | numexpr==2.8.4 36 | numpy==1.24.3 37 | openai==0.27.4 38 | openapi-schema-pydantic==1.2.4 39 | packaging==23.1 40 | protobuf==4.22.3 41 | pydantic==1.10.7 42 | pypdf==3.8.1 43 | python-dotenv==1.0.0 44 | PyYAML==6.0 45 | qdrant-client==1.1.6 46 | requests==2.28.2 47 | six==1.16.0 48 | sniffio==1.3.0 49 | SQLAlchemy==2.0.10 50 | tenacity==8.2.2 51 | tqdm==4.65.0 52 | typing-inspect==0.8.0 53 | typing_extensions==4.5.0 54 | urllib3==1.26.15 55 | Werkzeug==2.3.0 56 | yarl==1.9.2 
57 | --------------------------------------------------------------------------------