├── .github └── workflows │ └── ci.yaml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── README.md ├── contentmap ├── __init__.py ├── core.py ├── sitemap.py └── vss.py ├── docker-compose.yaml ├── poetry.lock ├── pyproject.toml └── tests ├── __init__.py ├── conftest.py ├── fixture.db ├── fixtures ├── sitemap_folder_a │ └── sitemap_a.xml └── sitemap_folder_b │ └── sitemap_b.xml ├── test_creator.py ├── test_sitemap.py ├── test_vss.py └── utils.py /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | 10 | jobs: 11 | Quality: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: true 15 | matrix: 16 | python-version: ["3.8", "3.9", "3.10", "3.11"] 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | 21 | - uses: actions/setup-python@v3 22 | with: 23 | python-version: ${{matrix.python-version}} 24 | 25 | - name: Install Python Poetry 26 | uses: abatilo/actions-poetry@v2.3.0 27 | 28 | - name: Configure poetry 29 | shell: bash 30 | run: python -m poetry config virtualenvs.in-project true 31 | 32 | - name: View poetry version 33 | run: poetry --version 34 | 35 | - name: Install dependencies 36 | run: | 37 | python -m poetry install 38 | 39 | - name: Test 40 | run: poetry run pytest -v 41 | 42 | Release: 43 | needs: Quality 44 | if: | 45 | github.event_name == 'push' && 46 | github.ref == 'refs/heads/main' && 47 | !contains ( github.event.head_commit.message, 'chore(release)' ) 48 | runs-on: ubuntu-latest 49 | concurrency: release 50 | permissions: 51 | id-token: write 52 | contents: write 53 | 54 | steps: 55 | - uses: actions/setup-python@v3 56 | with: 57 | python-version: 3.8 58 | 59 | - uses: actions/checkout@v3 60 | with: 61 | fetch-depth: 0 62 | 63 | - name: Check release status 64 | id: release-status 65 | shell: bash 66 | env: 67 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 68 | run: | 69 | pip install python-semantic-release 70 | if semantic-release --noop --strict version 71 | then 72 | echo "Releasing new version." 73 | else 74 | echo "Skipping release steps." 75 | fi 76 | 77 | - if: steps.release-status.outputs.released == 'true' 78 | name: Release to GitHub 79 | id: github-release 80 | env: 81 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 82 | run: | 83 | semantic-release version 84 | git fetch --tags 85 | for file in ./dist/** 86 | do gh release upload "${{steps.release-status.outputs.tag}}" $file 87 | done 88 | 89 | # - if: steps.release-status.outputs.released == 'true' 90 | # name: Release to Test PyPI 91 | # id: test-pypi-release 92 | # env: 93 | # TEST_PYPI_TOKEN: ${{ secrets.TEST_PYPI_TOKEN }} 94 | # run: | 95 | # poetry config repositories.test-pypi https://test.pypi.org/legacy/ 96 | # poetry config pypi-token.test-pypi $TEST_PYPI_TOKEN 97 | # poetry publish -r test-pypi -u __token__ 98 | 99 | - if: steps.release-status.outputs.released == 'true' 100 | name: Release to PyPI 101 | id: pypi-release 102 | env: 103 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 104 | run: | 105 | poetry config pypi-token.pypi $PYPI_TOKEN 106 | poetry publish -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | contentmap.db 2 | .DS_Store 3 | /scratch 4 | 5 | Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | .idea/ -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ## v0.5.0 (2024-08-24) 4 | 5 | ### Feature 6 | 7 | * feat: Add more than 1 sitemap and capability to work with xml file ([`f87a3ca`](https://github.com/philippe2803/contentmap/commit/f87a3ca29ce9ba386d2a1df403c9efcefa8432eb)) 8 | 9 | ### Unknown 10 | 11 | * Merge pull request #9 from medoror/pull-sitemaps-from-disk 12 | 13 | Pull sitemaps from disk ([`b6986a0`](https://github.com/philippe2803/contentmap/commit/b6986a080f54ea18ac515ccd714abd33f237a366)) 14 | 15 | * Pull sitemaps from disk 16 | 17 | * Provide a directory of xml sitemaps on disk as an option 18 | to initializing SitemapToContentDatabase 19 | * Allow for multiple sitemap urls ([`3911e78`](https://github.com/philippe2803/contentmap/commit/3911e78baffb0b69519a1b91e6b19508b85974cd)) 20 | 21 | * Merge pull request #6 from philippe2803/feature/alex_test 22 | 23 | test ([`318412b`](https://github.com/philippe2803/contentmap/commit/318412b3d74e60afc75e32f45baea2f8eed43399)) 24 | 25 | * test ([`b85be17`](https://github.com/philippe2803/contentmap/commit/b85be175bf94e7e00fd9bae25da5a3b8fd36d031)) 26 | 27 | * doc: Update README.md 28 | 29 | Adding a few more details and a link to article. ([`62ac810`](https://github.com/philippe2803/contentmap/commit/62ac81029063f11b235eed673445424b48c17a49)) 30 | 31 | ## v0.4.0 (2024-03-05) 32 | 33 | ### Feature 34 | 35 | * feat: Add sqlite-vss for vector search similariy capabilities 36 | 37 | * feat: Add sqlite-vss to add simioary search to sqlite 38 | 39 | * feat: Integrate with langchain for sqlite-vss implementation 40 | 41 | * feat: VSS now fully working 42 | 43 | * fix: remove unsused file 44 | 45 | * fix: Adjust unit test for similarity search ([`f165c67`](https://github.com/philippe2803/contentmap/commit/f165c67e929ee83b210b2078416a9506c37c66aa)) 46 | 47 | ## v0.3.0 (2024-02-26) 48 | 49 | ### Feature 50 | 51 | * feat: Add sqlite-vss to add similarity search to sqlite (#4) 52 | 53 | * feat: Add sqlite-vss to add simioary search to sqlite 54 | 55 | * feat: Integrate with langchain for sqlite-vss implementation ([`5865005`](https://github.com/philippe2803/contentmap/commit/5865005b3e3cba450fcae945d2c25e1e2ee05c64)) 56 | 57 | ## v0.2.0 (2024-01-09) 58 | 59 | ### Feature 60 | 61 | * feat: Add content map creator from XML sitemap (#3) 62 | 63 | * feat: Add content map creator from XML sitemap 64 | 65 | * Removing the if name snippets 66 | 67 | * fix: column name in test ([`ccbb8fb`](https://github.com/philippe2803/contentmap/commit/ccbb8fbf54faedb92f85ff0dda065c758801f6cb)) 68 | 69 | ## v0.1.1 (2024-01-08) 70 | 71 | ### Fix 72 | 73 | * fix: test semantic release ([`0fc4f4e`](https://github.com/philippe2803/contentmap/commit/0fc4f4ec8f5e0f3c78a2a4b78a9899733b577096)) 74 | 75 | ### Unknown 76 | 77 | * Merge pull request #2 from philippe2803/feature/release-dummy 78 | 79 | Updating pyproject toml to match main branch and to upload to pypi ([`de6a903`](https://github.com/philippe2803/contentmap/commit/de6a903f00969ff5a7af377bddd6c4f346b51815)) 80 | 81 | * Updating pyproject toml to match main branch and to upload to pypi ([`f0164af`](https://github.com/philippe2803/contentmap/commit/f0164afe09e7e6c0832093412d4f290aed4bf7a7)) 82 | 83 | * Merge pull request #1 from philippe2803/feature/release-dummy 84 | 85 | Testing pypi release ([`d69ee30`](https://github.com/philippe2803/contentmap/commit/d69ee30474eb57ec8e797cad560056ba3ecea58f)) 86 | 87 | * Testing pypi release ([`e8e6908`](https://github.com/philippe2803/contentmap/commit/e8e69084f08fb1ab579f352df0bbc4c973f192e1)) 88 | 89 | * Removinf test-pypi release ([`6c94ace`](https://github.com/philippe2803/contentmap/commit/6c94ace5544ac02c7f1446729d3469889b3fa128)) 90 | 91 | * Merge branch 'main' of github.com:philippe2803/contentmap into main ([`522e1b4`](https://github.com/philippe2803/contentmap/commit/522e1b42a411fb4508bd523fb6e9b4b843e8c3a3)) 92 | 93 | ## v0.1.0 (2024-01-08) 94 | 95 | ### Documentation 96 | 97 | * docs: starting readme ([`bd798dc`](https://github.com/philippe2803/contentmap/commit/bd798dcc94226b1894163bd58cf0d3e9d599361b)) 98 | 99 | ### Feature 100 | 101 | * feat: initial commit ([`34abe5e`](https://github.com/philippe2803/contentmap/commit/34abe5e3acd10422380ef231b016afb0ebca7e50)) 102 | 103 | ### Unknown 104 | 105 | * Adding workflow ([`a0ec24f`](https://github.com/philippe2803/contentmap/commit/a0ec24f2821268b8918975703ce064650088dd03)) 106 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | 4 | RUN apt update 5 | RUN apt install -y python3-pip libgomp1 libatlas-base-dev liblapack-dev libsqlite3-dev 6 | 7 | WORKDIR /app 8 | 9 | ADD poetry.lock /app/poetry.lock 10 | ADD pyproject.toml /app/pyproject.toml 11 | 12 | RUN pip install poetry 13 | RUN poetry config virtualenvs.create false 14 | RUN poetry install 15 | 16 | RUN python3 -c 'from sentence_transformers import SentenceTransformer; embedder = SentenceTransformer("all-MiniLM-L6-v2")' 17 | 18 | 19 | ADD . /app 20 | 21 | CMD ["pytest", "./tests"] 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Content map 2 | 3 | A way to share content from a specific domain using SQLite as an alternative to 4 | RSS feeds. The purpose of this library is to simply create a dataset for all the 5 | content on your website, using the XML sitemap as a starting point. 6 | 7 | Possibility to include vector search similarity features in the dataset very easily. 8 | 9 | Article that explains the rationale behind this type of datasets [here](https://philippeoger.com/pages/can-we-rag-the-whole-web/). 10 | 11 | 12 | ## Installation 13 | 14 | ```bash 15 | 16 | pip install contentmap 17 | 18 | ``` 19 | 20 | ## Quickstart 21 | 22 | To build your contentmap.db with vector search capabilities and containing all 23 | your content using your XML sitemap as a starting point, you only need to write the 24 | following: 25 | 26 | ```python 27 | from contentmap.sitemap import SitemapToContentDatabase 28 | 29 | database = SitemapToContentDatabase( 30 | sitemap_sources=["https://yourblog.com/sitemap.xml"], 31 | concurrency=10, 32 | include_vss=True 33 | ) 34 | database.build() 35 | 36 | ``` 37 | 38 | This will automatically create the SQLite database file, with vector search 39 | capabilities (piggybacking on sqlite-vss integration on Langchain). 40 | 41 | Thanks to @medoror for contributing. 42 | -------------------------------------------------------------------------------- /contentmap/__init__.py: -------------------------------------------------------------------------------- 1 | from contentmap.core import ContentMapCreator 2 | from contentmap.sitemap import SitemapToContentDatabase 3 | -------------------------------------------------------------------------------- /contentmap/core.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from datetime import datetime 3 | import sqlite3 4 | from contentmap.vss import ContentMapVSS 5 | 6 | 7 | class ContentMapCreator: 8 | 9 | def __init__( 10 | self, 11 | contents: List[Dict[str, str]], 12 | database: str = "contentmap.db", 13 | include_vss: bool = False 14 | ): 15 | self.contents = contents 16 | self.include_vss = include_vss 17 | self.connection = sqlite3.connect(database) 18 | self.connection.row_factory = sqlite3.Row 19 | 20 | if self.include_vss: 21 | import sqlite_vss 22 | self.connection.enable_load_extension(True) 23 | sqlite_vss.load(self.connection) 24 | self.connection.enable_load_extension(False) 25 | 26 | self.cursor = self.connection.cursor() 27 | 28 | def init_db(self): 29 | self.cursor.execute("CREATE TABLE IF NOT EXISTS content (url, content)") 30 | self.cursor.execute("CREATE TABLE IF NOT EXISTS config (cat, value)") 31 | self.connection.commit() 32 | 33 | def add_config(self): 34 | data = [ 35 | {"Generated with:": "Contentmap lib"}, 36 | {"Date:": datetime.now().strftime("%Y-%m-%d %H:%M:%S")}, 37 | {"Embeddings:": "all-MiniLM-L6-v2"} 38 | ] 39 | data = [{"cat": k, "value": v} for row in data for k, v in row.items()] 40 | self.cursor.executemany("INSERT INTO config VALUES (:cat, :value)", data) 41 | self.connection.commit() 42 | 43 | def build(self): 44 | self.init_db() 45 | self.add_config() 46 | self.cursor.executemany( 47 | "INSERT INTO content VALUES (:url, :content)", 48 | self.contents 49 | ) 50 | self.connection.commit() 51 | 52 | if self.include_vss: 53 | self.add_vss() 54 | 55 | def add_vss(self): 56 | cm_vss = ContentMapVSS(connection=self.connection) 57 | cm_vss.load() 58 | -------------------------------------------------------------------------------- /contentmap/sitemap.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from typing import Literal 4 | import requests 5 | import os 6 | 7 | import aiohttp 8 | import trafilatura 9 | from tqdm.asyncio import tqdm_asyncio 10 | from lxml import etree 11 | 12 | from contentmap.core import ContentMapCreator 13 | 14 | 15 | class SitemapToContentDatabase: 16 | SOURCE_TYPE_URL: Literal['url'] = 'url' 17 | SOURCE_TYPE_DISK: Literal['disk'] = 'disk' 18 | SourceType = Literal['url', 'disk'] 19 | 20 | def __init__(self, sitemap_sources: list, 21 | source_type: SourceType = SOURCE_TYPE_URL, 22 | seconds_timeout=10, 23 | concurrency=None, 24 | include_vss=False): 25 | self.sitemap_sources = sitemap_sources 26 | self.source_type = source_type 27 | self.semaphore = asyncio.Semaphore(concurrency) if concurrency is not None else None 28 | self.timeout = aiohttp.ClientTimeout( 29 | sock_connect=seconds_timeout, 30 | sock_read=seconds_timeout 31 | ) 32 | self.include_vss = include_vss 33 | 34 | def build(self): 35 | urls = self.get_urls() 36 | loop = asyncio.get_event_loop() 37 | contents = loop.run_until_complete(self.get_contents(urls)) 38 | cm = ContentMapCreator(contents, include_vss=self.include_vss) 39 | cm.build() 40 | 41 | def get_urls(self): 42 | all_urls = [] 43 | if self.source_type == self.SOURCE_TYPE_URL: 44 | for sitemap_url in self.sitemap_sources: 45 | urls = self._get_urls_from_url(sitemap_url) 46 | all_urls.extend(urls) 47 | elif self.source_type == self.SOURCE_TYPE_DISK: 48 | for directory in self.sitemap_sources: 49 | for filename in os.listdir(directory): 50 | if filename.endswith('.xml'): 51 | filepath = os.path.join(directory, filename) 52 | urls = self._get_urls_from_disk(filepath) 53 | all_urls.extend(urls) 54 | return all_urls 55 | 56 | def _get_urls_from_url(self, sitemap_url): 57 | r = requests.get(sitemap_url) 58 | tree = etree.fromstring(r.content) 59 | return self._extract_urls_from_tree(tree) 60 | 61 | def _get_urls_from_disk(self, filepath): 62 | tree = etree.parse(filepath) 63 | return self._extract_urls_from_tree(tree) 64 | 65 | def _extract_urls_from_tree(self, tree): 66 | return [ 67 | url.text for url 68 | in tree.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc") 69 | ] 70 | 71 | async def get_contents(self, urls): 72 | async with aiohttp.ClientSession(timeout=self.timeout) as session: 73 | tasks = [self.fetch_content(session, url) for url in urls] 74 | return await tqdm_asyncio.gather(*tasks) 75 | 76 | async def fetch_content(self, session, url): 77 | try: 78 | if not self.semaphore: 79 | async with session.get(url) as response: 80 | raw = await response.text() 81 | else: 82 | async with self.semaphore, session.get(url) as response: 83 | raw = await response.text() 84 | content = trafilatura.extract(raw) 85 | return {"url": url, "content": content} 86 | 87 | except aiohttp.ClientConnectionError as e: 88 | logging.error(f"Error while fetching {url}: {e.__repr__}") 89 | return None 90 | -------------------------------------------------------------------------------- /contentmap/vss.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class ContentMapVSS to create vector search dataset from a contentmap 3 | dataset already created. 4 | """ 5 | import sqlite3 6 | from typing import Optional 7 | 8 | from langchain.text_splitter import CharacterTextSplitter 9 | from langchain_community.embeddings.sentence_transformer import ( 10 | SentenceTransformerEmbeddings, 11 | ) 12 | from langchain_community.vectorstores import SQLiteVSS 13 | 14 | 15 | class ContentMapVSS: 16 | 17 | def __init__(self, 18 | connection: Optional[sqlite3.Connection] = None, 19 | db_file: str = "contentmap.db" 20 | ): 21 | 22 | self.connection = connection 23 | if not connection: 24 | self.connection = SQLiteVSS.create_connection(db_file) 25 | 26 | embedding_function = SentenceTransformerEmbeddings( 27 | model_name="all-MiniLM-L6-v2" 28 | ) 29 | self.vss = SQLiteVSS( 30 | table="content_chunks", 31 | embedding=embedding_function, 32 | connection=self.connection 33 | ) 34 | 35 | def load(self): 36 | # content table must be there 37 | assert self.table_exists(table_name="content") 38 | texts, metadatas = self.prepare_texts_and_metadatas() 39 | self.vss.add_texts(texts=texts, metadatas=metadatas) 40 | return self.vss 41 | 42 | def table_exists(self, table_name: str) -> bool: 43 | res = self.connection.execute(f""" 44 | SELECT name 45 | FROM sqlite_master 46 | WHERE type='table' AND name='{table_name}'; 47 | """) 48 | rows = res.fetchall() 49 | if len(rows) == 1: 50 | return True 51 | return False 52 | 53 | def prepare_texts_and_metadatas(self): 54 | cursor = self.connection.cursor() 55 | result = cursor.execute("SELECT content, url FROM content") 56 | rows = result.fetchall() 57 | 58 | # based on Anyscale analysis (https://t.ly/yjgxQ), it looks like the 59 | # sweet spot is 700 chunk size and 50 chunk overlap. 60 | text_splitter = CharacterTextSplitter( 61 | chunk_size=700, chunk_overlap=50, separator="." 62 | ) 63 | 64 | texts = [] 65 | metadatas = [] 66 | for row in rows: 67 | chunks = text_splitter.split_text(row["content"]) 68 | chunk_metadatas = [{"url": row["url"]} for _ in chunks] 69 | texts += chunks 70 | metadatas += chunk_metadatas 71 | 72 | return texts, metadatas 73 | 74 | def similarity_search(self, *args, **kwargs): 75 | data = self.vss.similarity_search(*args, **kwargs) 76 | rag_results = [] 77 | for doc in data: 78 | item = {"content": doc.page_content, "url": doc.metadata['url']} 79 | rag_results.append(item) 80 | return rag_results 81 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | app: 4 | build: 5 | context: . 6 | image: contentmap_local -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "contentmap" 3 | version = "0.5.0" 4 | description = "" 5 | authors = ["Philippe Oger "] 6 | readme = "README.md" 7 | packages = [{include = "contentmap"}] 8 | 9 | 10 | [tool.poetry.dependencies] 11 | python = "^3.9" 12 | requests = "^2.31.0" 13 | tqdm = "^4.66.1" 14 | lxml = "4.9.4" 15 | trafilatura = "^1.6.4" 16 | aiohttp = "^3.9.1" 17 | sqlite-vss = "^0.1.2" 18 | langchain = "^0.1.8" 19 | sentence-transformers = "^2.3.1" 20 | 21 | 22 | [tool.poetry.group.test.dependencies] 23 | pytest = "^7.4.4" 24 | 25 | [build-system] 26 | requires = ["poetry-core"] 27 | build-backend = "poetry.core.masonry.api" 28 | 29 | 30 | [tool.semantic_release] 31 | version_toml = [ 32 | "pyproject.toml:tool.poetry.version" 33 | ] 34 | 35 | branch = "main" 36 | upload_to_PyPI = true 37 | upload_to_release = true 38 | build_command = "pip install poetry && poetry build" 39 | commit_author = "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" 40 | 41 | 42 | [tool.semantic_release.commit_parser_options] 43 | allowed_tags = [ 44 | "build", 45 | "chore", 46 | "ci", 47 | "docs", 48 | "feat", 49 | "fix", 50 | "perf", 51 | "style", 52 | "refactor", 53 | "test" 54 | ] 55 | minor_tags = ["feat"] 56 | patch_tags = ["fix", "perf"] -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philippe2803/contentmap/c9e3b4371b2809e5ae50ddbd93d1c4e89ea40fb5/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import os.path as op 4 | import logging 5 | 6 | 7 | @pytest.fixture(autouse=True) 8 | def remove_created_database_after_test(): 9 | """Fixture to execute asserts before and after a test is run""" 10 | # Setup logic 11 | yield # this is where the testing happens 12 | # Teardown logic 13 | 14 | contentmap_db_path = op.join(op.dirname(__file__), "contentmap.db") 15 | if op.exists(contentmap_db_path): 16 | logging.info('Destroying mock sqlite content instance') 17 | os.remove(contentmap_db_path) 18 | 19 | -------------------------------------------------------------------------------- /tests/fixture.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philippe2803/contentmap/c9e3b4371b2809e5ae50ddbd93d1c4e89ea40fb5/tests/fixture.db -------------------------------------------------------------------------------- /tests/fixtures/sitemap_folder_a/sitemap_a.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://www.example.com/docs/en/example/?topic=testing 5 | 6 | 7 | https://www.example.com/docs/en/example/?topic=contact-us 8 | 9 | -------------------------------------------------------------------------------- /tests/fixtures/sitemap_folder_b/sitemap_b.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://www.example.com/docs/en/example/?topic=library-overview 5 | 6 | 7 | https://www.example.com/docs/en/example/?topic=about-this-content 8 | 9 | -------------------------------------------------------------------------------- /tests/test_creator.py: -------------------------------------------------------------------------------- 1 | from contentmap.core import ContentMapCreator 2 | import sqlite3 3 | 4 | 5 | data = [ 6 | {"url": "https://www.google.com", "content": "this is google home page"}, 7 | {"url": "https://www.google.com/about", "content": "this is google about page"}, 8 | ] 9 | 10 | 11 | def test_generator(): 12 | database = ContentMapCreator(data) 13 | assert isinstance(database.connection, sqlite3.Connection) 14 | assert isinstance(database.cursor, sqlite3.Cursor) 15 | 16 | 17 | def test_schema(): 18 | database = ContentMapCreator(data) 19 | database.build() 20 | query = database.cursor.execute("SELECT count(1) FROM content") 21 | assert query.fetchone()[0] == 2 22 | 23 | 24 | def test_content_creator_vss(): 25 | database = ContentMapCreator(data, include_vss=True) 26 | database.build() 27 | query = database.cursor.execute("SELECT count(1) FROM content_chunks") 28 | assert query.fetchone()[0] == 4 29 | 30 | 31 | def test_content_creator_vss_check_chunks(): 32 | database = ContentMapCreator(data, include_vss=True) 33 | database.build() 34 | query = "SELECT distinct(tbl_name) FROM sqlite_master" 35 | result = database.cursor.execute(query) 36 | found_tables = [] 37 | for row in result: 38 | found_tables.append(row["tbl_name"]) 39 | assert "content" in found_tables 40 | assert "content_chunks" in found_tables 41 | -------------------------------------------------------------------------------- /tests/test_sitemap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import pytest 4 | 5 | from unittest.mock import patch, MagicMock 6 | from contentmap.sitemap import SitemapToContentDatabase 7 | 8 | 9 | class TestSitemapToContentDatabase(unittest.TestCase): 10 | def create_mock_response(self, content): 11 | mock_response = MagicMock() 12 | mock_response.content = content 13 | return mock_response 14 | 15 | def generate_sample_sitemap_xml(self, url): 16 | return f''' 17 | 18 | 19 | {url} 20 | 21 | ''' 22 | @patch('contentmap.sitemap.requests.get') 23 | def test_get_urls_given_one_sitemap_url(self, mock_get): 24 | mock_get.return_value = self.create_mock_response(self.generate_sample_sitemap_xml('https://www.example.com/docs/en/example/?topic=testing')) 25 | 26 | sitemap_db = SitemapToContentDatabase(sitemap_sources=['https://example.com/sitemap.xml'], source_type='url') 27 | urls = sitemap_db.get_urls() 28 | 29 | self.assertEqual(urls, ['https://www.example.com/docs/en/example/?topic=testing']) 30 | mock_get.assert_called_once_with('https://example.com/sitemap.xml') 31 | 32 | 33 | @patch('contentmap.sitemap.requests.get') 34 | def test_get_urls_given_multiple_sitemap_urls(self, mock_get): 35 | mock_get.side_effect = [ 36 | self.create_mock_response(self.generate_sample_sitemap_xml('https://www.example.com/docs/en/example/?topic=testing')), 37 | self.create_mock_response(self.generate_sample_sitemap_xml('https://www.anotherexample.com/docs/en/example/?topic=contact-us')) 38 | ] 39 | 40 | sitemap_db = SitemapToContentDatabase(sitemap_sources=['https://example.com/sitemap.xml', 'https://anotherexample.com/sitemap.xml'], source_type='url') 41 | urls = sitemap_db.get_urls() 42 | 43 | self.assertEqual(urls, [ 44 | 'https://www.example.com/docs/en/example/?topic=testing', 45 | 'https://www.anotherexample.com/docs/en/example/?topic=contact-us' 46 | ]) 47 | mock_get.assert_any_call('https://example.com/sitemap.xml') 48 | mock_get.assert_any_call('https://anotherexample.com/sitemap.xml') 49 | self.assertEqual(mock_get.call_count, 2) 50 | 51 | def test_get_urls_given_one_location_on_disk(self): 52 | sitemap_folder_a_path = os.path.join(os.path.dirname(__file__), 'fixtures', 'sitemap_folder_a') 53 | sitemap_db = SitemapToContentDatabase(sitemap_sources=[sitemap_folder_a_path], source_type='disk') 54 | urls = sitemap_db.get_urls() 55 | 56 | self.assertEqual(urls, ['https://www.example.com/docs/en/example/?topic=testing', 57 | 'https://www.example.com/docs/en/example/?topic=contact-us' 58 | ]) 59 | 60 | 61 | def test_get_urls_given_multiple_locations_on_disk(self): 62 | sitemap_folder_a_path = os.path.join(os.path.dirname(__file__), 'fixtures', 'sitemap_folder_a') 63 | sitemap_folder_b_path = os.path.join(os.path.dirname(__file__), 'fixtures', 'sitemap_folder_b') 64 | sitemap_db = SitemapToContentDatabase(sitemap_sources=[sitemap_folder_a_path, sitemap_folder_b_path], source_type='disk') 65 | urls = sitemap_db.get_urls() 66 | 67 | self.assertEqual(urls, ['https://www.example.com/docs/en/example/?topic=testing', 68 | 'https://www.example.com/docs/en/example/?topic=contact-us', 69 | 'https://www.example.com/docs/en/example/?topic=library-overview', 70 | 'https://www.example.com/docs/en/example/?topic=about-this-content' 71 | ]) -------------------------------------------------------------------------------- /tests/test_vss.py: -------------------------------------------------------------------------------- 1 | from contentmap.vss import ContentMapVSS 2 | import os.path as op 3 | from tests.utils import build_fixture_db 4 | 5 | 6 | class TestContentMapVSS: 7 | 8 | def test_assertion_content_exists(self): 9 | fixture_db = op.join(op.dirname(__file__), "fixture.db") 10 | vss_content = ContentMapVSS(db_file=fixture_db) 11 | assert vss_content.table_exists(table_name="content") is True 12 | 13 | def test_assertion_content_not_exists(self): 14 | vss_content = ContentMapVSS(db_file=":memory:") 15 | assert vss_content.table_exists(table_name="content") is False 16 | 17 | 18 | class TestVssTablesCreation: 19 | 20 | def test_vss_instance(self): 21 | db = build_fixture_db() 22 | cm_vss = ContentMapVSS(db_file=db) 23 | cm_vss.load() 24 | assert cm_vss.table_exists("content_chunks") 25 | 26 | def test_prepare_texts_and_metadatas(self): 27 | db = build_fixture_db() 28 | cm_vss = ContentMapVSS(db_file=db) 29 | texts, metadatas = cm_vss.prepare_texts_and_metadatas() 30 | assert len(texts) == len(metadatas) >= 1 31 | 32 | def test_chunk_table(self): 33 | db = build_fixture_db() 34 | cm_vss = ContentMapVSS(db_file=db) 35 | cm_vss.load() 36 | assert cm_vss.table_exists("content_chunks") 37 | cursor = cm_vss.connection.cursor() 38 | res = cursor.execute("SELECT * FROM content_chunks") 39 | rows = res.fetchall() 40 | assert len(rows) >= 15 41 | 42 | def test_similarity_search(self): 43 | db = build_fixture_db() 44 | cm_vss = ContentMapVSS(db_file=db) 45 | cm_vss.load() 46 | data = cm_vss.similarity_search(query="who is Mistral ai company?", k=2) 47 | assert len(data) == 2 48 | urls = [doc["url"] for doc in data] 49 | for url in urls: 50 | assert url == "https://philippeoger.com/pages/ai-scene-in-europe-last-week/" 51 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | import shutil 3 | 4 | 5 | def build_fixture_db(): 6 | fixture_db = op.join(op.dirname(__file__), 'fixture.db') 7 | dest = op.join(op.dirname(__file__), 'contentmap.db') 8 | shutil.copy(fixture_db, dest) 9 | return dest 10 | --------------------------------------------------------------------------------