├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docs └── assets │ ├── documentchunk-instances.jpg │ └── parse-and-index.jpg ├── requirements.txt ├── setup.py └── src └── django_langchain ├── __init__.py ├── admin.py ├── apps.py ├── migrations ├── 0001_initial.py ├── 0002_alter_documentchunk_index.py └── __init__.py ├── models.py ├── tests.py └── views.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | sample-data/ 162 | .DS_Store 163 | vars.env 164 | vars.list 165 | bin 166 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Pierre Alexandre SCHEMBRI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include MANIFEST.in 3 | include README.md 4 | include requirements.txt 5 | graft src/django_langchain 6 | graft docs 7 | global-exclude __pycache__ 8 | global-exclude *.py[co] 9 | global-exclude *.DS_Store 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: setup 2 | 3 | 4 | venv: 5 | @python3.9 -m venv venv 6 | -ln -s venv/bin . 7 | 8 | 9 | setup: venv 10 | @venv/bin/pip3 install -U pip 11 | @venv/bin/pip3 install -r requirements.txt 12 | @venv/bin/pip3 install -e . 13 | 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Django Langchain 2 | 3 | **EDIT - 2024/09** 4 | 5 | **This project is no longer maintained.** 6 | 7 | **If you need the ownership of the pypi package [django-langchain](https://pypi.org/project/django-langchain/), please open an issue.** 8 | 9 | 10 | ## What's this ? 11 | 12 | This package aims to provide integration between LangChain and Django. 13 | 14 | This is very early and the current stage is : `alpha` 15 | 16 | ## Compatibility 17 | 18 | LangChain is moving at a fast pace and interfaces may change. 19 | Version pinning is not enforced at the time to be able to use latest releases. 
20 | 21 | This package is built using: 22 | 23 | - LangChain v0.0.229 24 | - Django 4.2 25 | - Python 3.9 26 | 27 | ## Database requirements 28 | 29 | You must use a pgvector-enabled PostgreSQL database. 30 | 31 | 32 | ## Roadmap 33 | 34 | Never-ending to-do list: 35 | 36 | - [X] Base document models (chunks and sources) 37 | - [X] A LangChain-compliant document store 38 | - [X] Embeddings are computed via `SourceDocument.parse_file` 39 | - [X] Bare-bones admin management interface 40 | - [ ] Serializers (output) to use with Django REST Framework 41 | - [ ] Comprehensive set of utilities binding LLM & chains 42 | - [ ] (TBC) Serializers (input) to use with Django REST Framework 43 | - [ ] (TBC) Add support for Celery or another async queuing system 44 | - [ ] (TBC) Add support for data visualization in the Django admin 45 | 46 | 47 | ## Quickstart 48 | 49 | ### Install 50 | 51 | ```shell 52 | pip install django-langchain 53 | ``` 54 | 55 | ### Using the admin site 56 | 57 | 1. Store a file in the `SourceDocument` model 58 | 2. Perform the admin action `Parse and index file` 59 | 60 | ![](docs/assets/parse-and-index.jpg) 61 | 62 | 3. Check for `DocumentChunk` instances 63 | 64 | ![](docs/assets/documentchunk-instances.jpg) 65 | 66 | 4. 
Perform a similarity search using Django's admin built-in search (returns 5 most relevant results) 67 | 68 | 69 | ### Using the console 70 | 71 | ```python 72 | from django_langchain.models import SourceDocument 73 | 74 | source_doc = SourceDocument(title='< Title >', file='< /path/to/file/ >') 75 | source_doc.save() 76 | 77 | #: This could take some time and some non-trivial amount of RAM 78 | source_doc.parse_file() 79 | print(source_doc.documentchunk_set.count()) 80 | ``` 81 | 82 | ```python 83 | from django_langchain.models import DocumentChunk, DocumentChunkStore 84 | 85 | # Perform a similarity search 86 | queryset = DocumentChunk.objects.search('< query >', max_results=5) 87 | 88 | # Or using DocumentStore Langchain interface 89 | 90 | store = DocumentChunkStore() 91 | source, doc = store.search("< query >") 92 | 93 | print(doc.metadata) 94 | print(doc.page_content) 95 | ``` 96 | -------------------------------------------------------------------------------- /docs/assets/documentchunk-instances.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paschembri/django-langchain/001ed3c2e22a9ffba9c5022f2dc066f2fdedb82f/docs/assets/documentchunk-instances.jpg -------------------------------------------------------------------------------- /docs/assets/parse-and-index.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paschembri/django-langchain/001ed3c2e22a9ffba9c5022f2dc066f2fdedb82f/docs/assets/parse-and-index.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | django 2 | djangorestframework 3 | langchain 4 | sentence_transformers 5 | unstructured 6 | openai 7 | pgvector 8 | psycopg[binary] 9 | -------------------------------------------------------------------------------- /setup.py: 
# --------------------------------------------------------------------------------
# coding: utf-8
"""Packaging script for the ``django-langchain`` distribution."""
from pathlib import Path
from setuptools import setup, find_packages


here = Path(__file__).parent
packages = find_packages("src")
long_description = (here / "README.md").read_text(encoding="utf-8")
# A plain ``split("\n")`` leaves an empty string behind for the trailing
# newline (and for any blank line); keep only real requirement specifiers.
requirements = [
    line.strip()
    for line in (here / "requirements.txt").read_text(encoding="utf-8").splitlines()
    if line.strip() and not line.lstrip().startswith("#")
]

setup(
    name="django-langchain",
    version="0.0.1",
    # The LICENSE file shipped with the project (and listed in MANIFEST.in)
    # is the MIT license, so the metadata must say the same.
    license="MIT",
    description="Django Langchain",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author=" P.A. SCHEMBRI",
    author_email="pa.schembri@advanced-stack.com",
    url="https://github.com/paschembri/django-langchain",
    packages=packages,
    package_dir={"": "src"},
    include_package_data=True,
    install_requires=requirements,
)

# --------------------------------------------------------------------------------
# /src/django_langchain/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/paschembri/django-langchain/001ed3c2e22a9ffba9c5022f2dc066f2fdedb82f/src/django_langchain/__init__.py
# --------------------------------------------------------------------------------
# /src/django_langchain/admin.py:
# --------------------------------------------------------------------------------
from django.contrib import admin
from .models import SourceDocument, DocumentChunk


@admin.action(description="Parse and index file")
def parse_and_index(modeladmin, request, queryset):
    """Admin action: call ``parse_file()`` on every selected source document."""
    for instance in queryset:
        instance.parse_file()


class SourceDocumentAdmin(admin.ModelAdmin):
    """Admin configuration for ``SourceDocument``."""

    list_display = ["title", "indexed", "chunks_count"]
    actions = [parse_and_index]

    @admin.display(description="Number of chunks")
    def chunks_count(self, instance):
        """Return the number of chunks attached to ``instance``."""
        return instance.documentchunk_set.count()


class DocumentChunkAdmin(admin.ModelAdmin):
    """Admin configuration for ``DocumentChunk``.

    The search box runs the model manager's ``search()`` on the typed term
    instead of the default text lookup.
    """

    exclude = ["embedding", "index"]
    search_fields = ["page_content"]

    def get_search_results(self, request, queryset, search_term):
        """Restrict ``queryset`` to the chunks returned by the manager's search.

        Returns the ``(queryset, may_have_duplicates)`` pair expected by the
        admin. An empty search term leaves ``queryset`` untouched.
        """
        if not search_term:
            return queryset, False

        # Filter on primary keys rather than using ``queryset.intersection()``:
        # a combined queryset cannot be filtered again, and the admin does
        # filter the result further (e.g. when running an action on it).
        matching_ids = [chunk.pk for chunk in self.model.objects.search(search_term)]
        return queryset.filter(pk__in=matching_ids), False


admin.site.register(SourceDocument, SourceDocumentAdmin)
admin.site.register(DocumentChunk, DocumentChunkAdmin)

# --------------------------------------------------------------------------------
# /src/django_langchain/apps.py:
# --------------------------------------------------------------------------------
from django.apps import AppConfig


class DjangoLangchainConfig(AppConfig):
    """Application configuration for the ``django_langchain`` app."""

    default_auto_field = "django.db.models.BigAutoField"
    name = "django_langchain"
    verbose_name = "LangChain"

# --------------------------------------------------------------------------------
# /src/django_langchain/migrations/0001_initial.py:
# --------------------------------------------------------------------------------
# Generated by Django 4.2.3 on 2023-07-11 13:53

from django.db import migrations, models
import django.db.models.deletion
import pgvector.django


class Migration(migrations.Migration):
    initial = True

    dependencies = []

    operations = [
        pgvector.django.VectorExtension(),
        migrations.CreateModel(
            name="SourceDocument",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("title", models.CharField(blank=True, default="")),
                ("file", models.FileField(upload_to="")),
                ("indexed", models.BooleanField(default=False)),
            ],
        ),
        migrations.CreateModel(
            name="DocumentChunk",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("metadata", models.JSONField()),
                ("page_content", models.TextField()),
                ("embedding", pgvector.django.VectorField(dimensions=768)),
                (
                    "source",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="django_langchain.sourcedocument",
                    ),
                ),
                ("index", models.IntegerField(default=0)),
            ],
        ),
    ]

# --------------------------------------------------------------------------------
# /src/django_langchain/migrations/0002_alter_documentchunk_index.py:
# --------------------------------------------------------------------------------
# Generated by Django 4.2.3 on 2023-07-11 13:55

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("django_langchain", "0001_initial"),
    ]

    operations = [
        migrations.AlterField(
            model_name="documentchunk",
            name="index",
            field=models.IntegerField(),
        ),
    ]

# --------------------------------------------------------------------------------
# /src/django_langchain/migrations/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/paschembri/django-langchain/001ed3c2e22a9ffba9c5022f2dc066f2fdedb82f/src/django_langchain/migrations/__init__.py
# --------------------------------------------------------------------------------
# /src/django_langchain/models.py:
# --------------------------------------------------------------------------------
import os
from django.db import models
from pgvector.django import VectorField, CosineDistance
from langchain.document_loaders import UnstructuredFileLoader
from langchain.docstore.base import Docstore
from langchain.embeddings.huggingface import HuggingFaceEmbeddings


# Single embedding model shared by the whole module; it is instantiated as
# soon as the module is imported.
embedding_model = HuggingFaceEmbeddings()

class SourceDocument(models.Model):
    """A stored file that ``parse_file`` splits into ``DocumentChunk`` rows."""

    title = models.CharField(blank=True, default="")
    file = models.FileField()
    # Set to True once ``parse_file`` has stored the chunks.
    indexed = models.BooleanField(default=False)

    def parse_file(self, compute_embeddings=True):
        """Load the file, split it into chunks and store them.

        The file is read with ``UnstructuredFileLoader`` in ``"elements"``
        mode; every loaded element becomes one ``DocumentChunk`` numbered
        from 1. Chunks left over from a previous run are replaced, so
        indexing the same document twice does not create duplicates.

        Args:
            compute_embeddings: When true, compute the embedding of every
                chunk before saving.
                NOTE(review): the ``embedding`` column is declared without
                ``null=True``, so saving chunks without embeddings
                presumably fails at insert time -- confirm.
        """
        # Local import: keeps this block independent of the module's import header.
        from django.db import transaction

        loader = UnstructuredFileLoader(self.file.path, mode="elements")

        chunks = [
            DocumentChunk(
                index=index,
                metadata=doc.metadata,
                page_content=doc.page_content,
                source=self,
            )
            for index, doc in enumerate(loader.load(), start=1)
        ]

        if compute_embeddings and chunks:
            # One batched call instead of one model call per chunk.
            embeddings = embedding_model.embed_documents(
                [chunk.page_content for chunk in chunks]
            )
            for chunk, embedding in zip(chunks, embeddings):
                chunk.embedding = embedding

        # Replace the chunks and flip the flag together, so a failure cannot
        # leave the document half indexed.
        with transaction.atomic():
            self.documentchunk_set.all().delete()
            DocumentChunk.objects.bulk_create(chunks)
            self.indexed = True
            self.save()


class DocumentChunkManager(models.Manager):
    """Manager adding similarity search to ``DocumentChunk``."""

    def search(self, query, max_results=5):
        """Return the chunks closest to ``query``, nearest first.

        The query text is embedded with the module's embedding model and the
        chunks are ordered by cosine distance to that embedding.

        Args:
            query: Text to search for.
            max_results: Maximum number of chunks to return.

        Returns:
            A sliced queryset holding at most ``max_results`` chunks.
        """
        query_embedding = embedding_model.embed_query(query)

        # ``self.get_queryset()`` rather than ``DocumentChunk.objects`` so the
        # search honours the manager it is called on (e.g. a related manager).
        return self.get_queryset().order_by(
            CosineDistance("embedding", query_embedding)
        )[:max_results]


class DocumentChunk(models.Model):
    """One element of a parsed ``SourceDocument`` together with its embedding."""

    # Position of the chunk in its source document (``parse_file`` counts from 1).
    index = models.IntegerField()
    metadata = models.JSONField()
    page_content = models.TextField()
    embedding = VectorField(dimensions=768)
    source = models.ForeignKey(SourceDocument, on_delete=models.CASCADE)

    objects = DocumentChunkManager()

    def __str__(self):
        metadata = self.metadata or {}
        # ``basename(None)`` raises TypeError, so fall back to an empty name
        # when the metadata carries no "source" entry.
        source = os.path.basename(metadata.get("source") or "")

        return f"[{source}][chunk:{self.index}] p{metadata.get('page_number')}"


class DocumentChunkStore(Docstore):
    """LangChain ``Docstore`` backed by the ``DocumentChunk`` table."""

    def search(self, search: str):
        """
        Perform a similarity search throughout documents (i.e. text chunks)

        Returns:
            A ``(source, chunk)`` tuple: the ``"source"`` entry of the best
            matching chunk's metadata, and the chunk itself.

        Raises:
            IndexError: If the search returns no chunk at all.
        """
        # Only the best match is used, so do not fetch more than one row.
        results = DocumentChunk.objects.search(search, max_results=1)
        try:
            chunk = results[0]
        except IndexError:
            raise IndexError(
                "Similarity search returned no result: no document chunk is indexed."
            ) from None
        return chunk.metadata.get("source"), chunk

# --------------------------------------------------------------------------------
# /src/django_langchain/tests.py:
# --------------------------------------------------------------------------------
from django.test import TestCase

# Create your tests here.

# --------------------------------------------------------------------------------
# /src/django_langchain/views.py:
# --------------------------------------------------------------------------------
from django.shortcuts import render

# Create your views here.

# --------------------------------------------------------------------------------