├── .github
    └── workflows
    │   └── ci.yaml
├── .gitignore
├── CHANGELOG.md
├── Dockerfile
├── README.md
├── contentmap
    ├── __init__.py
    ├── core.py
    ├── sitemap.py
    └── vss.py
├── docker-compose.yaml
├── poetry.lock
├── pyproject.toml
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── fixture.db
    ├── fixtures
        ├── sitemap_folder_a
        │   └── sitemap_a.xml
        └── sitemap_folder_b
        │   └── sitemap_b.xml
    ├── test_creator.py
    ├── test_sitemap.py
    ├── test_vss.py
    └── utils.py


/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
  1 | name: CI
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [ main ]
  6 |   pull_request:
  7 |     branches: [ main ]
  8 | 
  9 | 
 10 | jobs:
 11 |   Quality:
 12 |     runs-on: ubuntu-latest
 13 |     strategy:
 14 |         fail-fast: true
 15 |         matrix:
 16 |           python-version: ["3.8", "3.9", "3.10", "3.11"]
 17 | 
 18 |     steps:
 19 |     - uses: actions/checkout@v3
 20 | 
 21 |     - uses: actions/setup-python@v3
 22 |       with:
 23 |         python-version: ${{matrix.python-version}}
 24 | 
 25 |     - name: Install Python Poetry
 26 |       uses: abatilo/actions-poetry@v2.3.0
 27 | 
 28 |     - name: Configure poetry
 29 |       shell: bash
 30 |       run: python -m poetry config virtualenvs.in-project true
 31 | 
 32 |     - name: View poetry version
 33 |       run: poetry --version
 34 | 
 35 |     - name: Install dependencies
 36 |       run: |
 37 |         python -m poetry install
 38 | 
 39 |     - name: Test
 40 |       run: poetry run pytest -v
 41 | 
 42 |   Release:
 43 |     needs: Quality
 44 |     if: |
 45 |       github.event_name == 'push' && 
 46 |       github.ref == 'refs/heads/main' && 
 47 |       !contains ( github.event.head_commit.message, 'chore(release)' )
 48 |     runs-on: ubuntu-latest
 49 |     concurrency: release
 50 |     permissions:
 51 |       id-token: write
 52 |       contents: write
 53 | 
 54 |     steps:
 55 |       - uses: actions/setup-python@v3
 56 |         with:
 57 |           python-version: 3.8
 58 | 
 59 |       - uses: actions/checkout@v3
 60 |         with:
 61 |           fetch-depth: 0
 62 | 
 63 |       - name: Check release status
 64 |         id: release-status
 65 |         shell: bash
 66 |         env:
 67 |           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 68 |         run: |
 69 |           pip install python-semantic-release
 70 |           if semantic-release --noop --strict version
 71 |           then
 72 |             echo "Releasing new version."
 73 |           else
 74 |             echo "Skipping release steps."
 75 |           fi
 76 | 
 77 |       - if: steps.release-status.outputs.released == 'true'
 78 |         name: Release to GitHub
 79 |         id: github-release
 80 |         env:
 81 |           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 82 |         run: |
 83 |           semantic-release version
 84 |           git fetch --tags
 85 |           for file in ./dist/**
 86 |             do gh release upload "${{steps.release-status.outputs.tag}}" $file
 87 |           done
 88 | 
 89 | #      - if: steps.release-status.outputs.released == 'true'
 90 | #        name: Release to Test PyPI
 91 | #        id: test-pypi-release
 92 | #        env:
 93 | #          TEST_PYPI_TOKEN: ${{ secrets.TEST_PYPI_TOKEN }}
 94 | #        run: |
 95 | #          poetry config repositories.test-pypi https://test.pypi.org/legacy/
 96 | #          poetry config pypi-token.test-pypi $TEST_PYPI_TOKEN
 97 | #          poetry publish -r test-pypi -u __token__
 98 | 
 99 |       - if: steps.release-status.outputs.released == 'true'
100 |         name: Release to PyPI
101 |         id: pypi-release
102 |         env:
103 |           PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
104 |         run: |
105 |           poetry config pypi-token.pypi $PYPI_TOKEN
106 |           poetry publish


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | contentmap.db
  2 | .DS_Store
  3 | /scratch
  4 | 
  5 |  Byte-compiled / optimized / DLL files
  6 | __pycache__/
  7 | *.py[cod]
  8 | *$py.class
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .nox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *.cover
 53 | *.py,cover
 54 | .hypothesis/
 55 | .pytest_cache/
 56 | cover/
 57 | 
 58 | # Translations
 59 | *.mo
 60 | *.pot
 61 | 
 62 | # Django stuff:
 63 | *.log
 64 | local_settings.py
 65 | db.sqlite3
 66 | db.sqlite3-journal
 67 | 
 68 | # Flask stuff:
 69 | instance/
 70 | .webassets-cache
 71 | 
 72 | # Scrapy stuff:
 73 | .scrapy
 74 | 
 75 | # Sphinx documentation
 76 | docs/_build/
 77 | 
 78 | # PyBuilder
 79 | .pybuilder/
 80 | target/
 81 | 
 82 | # Jupyter Notebook
 83 | .ipynb_checkpoints
 84 | 
 85 | # IPython
 86 | profile_default/
 87 | ipython_config.py
 88 | 
 89 | # pyenv
 90 | #   For a library or package, you might want to ignore these files since the code is
 91 | #   intended to run in multiple environments; otherwise, check them in:
 92 | # .python-version
 93 | 
 94 | # pipenv
 95 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 96 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 97 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 98 | #   install all needed dependencies.
 99 | #Pipfile.lock
100 | 
101 | # poetry
102 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
104 | #   commonly ignored for libraries.
105 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 | 
108 | # pdm
109 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | #   in version control.
113 | #   https://pdm.fming.dev/#use-with-ide
114 | .pdm.toml
115 | 
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 | 
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 | 
123 | # SageMath parsed files
124 | *.sage.py
125 | 
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 | 
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 | 
139 | # Rope project settings
140 | .ropeproject
141 | 
142 | # mkdocs documentation
143 | /site
144 | 
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 | 
150 | # Pyre type checker
151 | .pyre/
152 | 
153 | # pytype static type analyzer
154 | .pytype/
155 | 
156 | # Cython debug symbols
157 | cython_debug/
158 | 
159 | # PyCharm
160 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
163 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | .idea/


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # CHANGELOG
  2 | 
  3 | ## v0.5.0 (2024-08-24)
  4 | 
  5 | ### Feature
  6 | 
  7 | * feat: Add more than 1 sitemap and capability to work with xml file ([`f87a3ca`](https://github.com/philippe2803/contentmap/commit/f87a3ca29ce9ba386d2a1df403c9efcefa8432eb))
  8 | 
  9 | ### Unknown
 10 | 
 11 | * Merge pull request #9 from medoror/pull-sitemaps-from-disk
 12 | 
 13 | Pull sitemaps from disk ([`b6986a0`](https://github.com/philippe2803/contentmap/commit/b6986a080f54ea18ac515ccd714abd33f237a366))
 14 | 
 15 | * Pull sitemaps from disk
 16 | 
 17 | * Provide a directory of xml sitemaps on disk as an option
 18 |   to initializing SitemapToContentDatabase
 19 | * Allow for multiple sitemap urls ([`3911e78`](https://github.com/philippe2803/contentmap/commit/3911e78baffb0b69519a1b91e6b19508b85974cd))
 20 | 
 21 | * Merge pull request #6 from philippe2803/feature/alex_test
 22 | 
 23 | test ([`318412b`](https://github.com/philippe2803/contentmap/commit/318412b3d74e60afc75e32f45baea2f8eed43399))
 24 | 
 25 | * test ([`b85be17`](https://github.com/philippe2803/contentmap/commit/b85be175bf94e7e00fd9bae25da5a3b8fd36d031))
 26 | 
 27 | * doc: Update README.md
 28 | 
 29 | Adding a few more details and a link to article. ([`62ac810`](https://github.com/philippe2803/contentmap/commit/62ac81029063f11b235eed673445424b48c17a49))
 30 | 
 31 | ## v0.4.0 (2024-03-05)
 32 | 
 33 | ### Feature
 34 | 
 35 | * feat: Add sqlite-vss for vector search similariy capabilities
 36 | 
 37 | * feat: Add sqlite-vss to add simioary search to sqlite
 38 | 
 39 | * feat: Integrate with langchain for sqlite-vss implementation
 40 | 
 41 | * feat: VSS now fully working
 42 | 
 43 | * fix: remove unsused file
 44 | 
 45 | * fix: Adjust unit test for similarity search ([`f165c67`](https://github.com/philippe2803/contentmap/commit/f165c67e929ee83b210b2078416a9506c37c66aa))
 46 | 
 47 | ## v0.3.0 (2024-02-26)
 48 | 
 49 | ### Feature
 50 | 
 51 | * feat: Add sqlite-vss to add similarity search to sqlite (#4)
 52 | 
 53 | * feat: Add sqlite-vss to add simioary search to sqlite
 54 | 
 55 | * feat: Integrate with langchain for sqlite-vss implementation ([`5865005`](https://github.com/philippe2803/contentmap/commit/5865005b3e3cba450fcae945d2c25e1e2ee05c64))
 56 | 
 57 | ## v0.2.0 (2024-01-09)
 58 | 
 59 | ### Feature
 60 | 
 61 | * feat: Add content map creator from XML sitemap (#3)
 62 | 
 63 | * feat: Add content map creator from XML sitemap
 64 | 
 65 | * Removing the if name snippets
 66 | 
 67 | * fix: column name in test ([`ccbb8fb`](https://github.com/philippe2803/contentmap/commit/ccbb8fbf54faedb92f85ff0dda065c758801f6cb))
 68 | 
 69 | ## v0.1.1 (2024-01-08)
 70 | 
 71 | ### Fix
 72 | 
 73 | * fix: test semantic release ([`0fc4f4e`](https://github.com/philippe2803/contentmap/commit/0fc4f4ec8f5e0f3c78a2a4b78a9899733b577096))
 74 | 
 75 | ### Unknown
 76 | 
 77 | * Merge pull request #2 from philippe2803/feature/release-dummy
 78 | 
 79 | Updating pyproject toml to match main branch and to upload to pypi ([`de6a903`](https://github.com/philippe2803/contentmap/commit/de6a903f00969ff5a7af377bddd6c4f346b51815))
 80 | 
 81 | * Updating pyproject toml to match main branch and to upload to pypi ([`f0164af`](https://github.com/philippe2803/contentmap/commit/f0164afe09e7e6c0832093412d4f290aed4bf7a7))
 82 | 
 83 | * Merge pull request #1 from philippe2803/feature/release-dummy
 84 | 
 85 | Testing pypi release ([`d69ee30`](https://github.com/philippe2803/contentmap/commit/d69ee30474eb57ec8e797cad560056ba3ecea58f))
 86 | 
 87 | * Testing pypi release ([`e8e6908`](https://github.com/philippe2803/contentmap/commit/e8e69084f08fb1ab579f352df0bbc4c973f192e1))
 88 | 
 89 | * Removinf test-pypi release ([`6c94ace`](https://github.com/philippe2803/contentmap/commit/6c94ace5544ac02c7f1446729d3469889b3fa128))
 90 | 
 91 | * Merge branch &#39;main&#39; of github.com:philippe2803/contentmap into main ([`522e1b4`](https://github.com/philippe2803/contentmap/commit/522e1b42a411fb4508bd523fb6e9b4b843e8c3a3))
 92 | 
 93 | ## v0.1.0 (2024-01-08)
 94 | 
 95 | ### Documentation
 96 | 
 97 | * docs: starting readme ([`bd798dc`](https://github.com/philippe2803/contentmap/commit/bd798dcc94226b1894163bd58cf0d3e9d599361b))
 98 | 
 99 | ### Feature
100 | 
101 | * feat: initial commit ([`34abe5e`](https://github.com/philippe2803/contentmap/commit/34abe5e3acd10422380ef231b016afb0ebca7e50))
102 | 
103 | ### Unknown
104 | 
105 | * Adding workflow ([`a0ec24f`](https://github.com/philippe2803/contentmap/commit/a0ec24f2821268b8918975703ce064650088dd03))
106 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:22.04
 2 | 
 3 | 
 4 | RUN apt update
 5 | RUN apt install -y python3-pip libgomp1 libatlas-base-dev liblapack-dev libsqlite3-dev
 6 | 
 7 | WORKDIR /app
 8 | 
 9 | ADD poetry.lock /app/poetry.lock
10 | ADD pyproject.toml /app/pyproject.toml
11 | 
12 | RUN pip install poetry
13 | RUN poetry config virtualenvs.create false
14 | RUN poetry install
15 | 
16 | RUN python3 -c 'from sentence_transformers import SentenceTransformer; embedder = SentenceTransformer("all-MiniLM-L6-v2")'
17 | 
18 | 
19 | ADD . /app
20 | 
21 | CMD ["pytest", "./tests"]
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Content map
 2 | 
 3 | A way to share content from a specific domain using SQLite as an alternative to 
 4 | RSS feeds. The purpose of this library is to simply create a dataset for all the
 5 | content on your website, using the XML sitemap as a starting point.  
 6 | 
 7 | Possibility to include vector search similarity features in the dataset very easily.
 8 | 
 9 | Article that explains the rationale behind this type of datasets [here](https://philippeoger.com/pages/can-we-rag-the-whole-web/).
10 | 
11 | 
12 | ## Installation
13 | 
14 | ```bash
15 | 
16 | pip install contentmap
17 | 
18 | ```
19 | 
20 | ## Quickstart
21 | 
22 | To build your contentmap.db with vector search capabilities and containing all 
23 | your content using your XML sitemap as a starting point, you only need to write the
24 | following: 
25 | 
26 | ```python
27 | from contentmap.sitemap import SitemapToContentDatabase
28 | 
29 | database = SitemapToContentDatabase(
30 |     sitemap_sources=["https://yourblog.com/sitemap.xml"],
31 |     concurrency=10,
32 |     include_vss=True
33 | )
34 | database.build()
35 | 
36 | ```
37 | 
38 | This will automatically create the SQLite database file, with vector search 
39 | capabilities (piggybacking on sqlite-vss integration on Langchain).
40 | 
41 | Thanks to @medoror for contributing.
42 | 


--------------------------------------------------------------------------------
/contentmap/__init__.py:
--------------------------------------------------------------------------------
1 | from contentmap.core import ContentMapCreator
2 | from contentmap.sitemap import SitemapToContentDatabase
3 | 


--------------------------------------------------------------------------------
/contentmap/core.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Dict
 2 | from datetime import datetime
 3 | import sqlite3
 4 | from contentmap.vss import ContentMapVSS
 5 | 
 6 | 
 7 | class ContentMapCreator:
 8 | 
 9 |     def __init__(
10 |             self,
11 |             contents: List[Dict[str, str]],
12 |             database: str = "contentmap.db",
13 |             include_vss: bool = False
14 |     ):
15 |         self.contents = contents
16 |         self.include_vss = include_vss
17 |         self.connection = sqlite3.connect(database)
18 |         self.connection.row_factory = sqlite3.Row
19 | 
20 |         if self.include_vss:
21 |             import sqlite_vss
22 |             self.connection.enable_load_extension(True)
23 |             sqlite_vss.load(self.connection)
24 |             self.connection.enable_load_extension(False)
25 | 
26 |         self.cursor = self.connection.cursor()
27 | 
28 |     def init_db(self):
29 |         self.cursor.execute("CREATE TABLE IF NOT EXISTS content (url, content)")
30 |         self.cursor.execute("CREATE TABLE IF NOT EXISTS config (cat, value)")
31 |         self.connection.commit()
32 | 
33 |     def add_config(self):
34 |         data = [
35 |             {"Generated with:": "Contentmap lib"},
36 |             {"Date:": datetime.now().strftime("%Y-%m-%d %H:%M:%S")},
37 |             {"Embeddings:": "all-MiniLM-L6-v2"}
38 |         ]
39 |         data = [{"cat": k, "value": v} for row in data for k, v in row.items()]
40 |         self.cursor.executemany("INSERT INTO config VALUES (:cat, :value)", data)
41 |         self.connection.commit()
42 | 
43 |     def build(self):
44 |         self.init_db()
45 |         self.add_config()
46 |         self.cursor.executemany(
47 |             "INSERT INTO content VALUES (:url, :content)",
48 |             self.contents
49 |         )
50 |         self.connection.commit()
51 | 
52 |         if self.include_vss:
53 |             self.add_vss()
54 | 
55 |     def add_vss(self):
56 |         cm_vss = ContentMapVSS(connection=self.connection)
57 |         cm_vss.load()
58 | 


--------------------------------------------------------------------------------
/contentmap/sitemap.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import logging
 3 | from typing import Literal
 4 | import requests
 5 | import os
 6 | 
 7 | import aiohttp
 8 | import trafilatura
 9 | from tqdm.asyncio import tqdm_asyncio
10 | from lxml import etree
11 | 
12 | from contentmap.core import ContentMapCreator
13 | 
14 | 
15 | class SitemapToContentDatabase:
16 |     SOURCE_TYPE_URL: Literal['url'] = 'url'
17 |     SOURCE_TYPE_DISK: Literal['disk'] = 'disk'
18 |     SourceType = Literal['url', 'disk']
19 | 
20 |     def __init__(self, sitemap_sources: list,
21 |                  source_type: SourceType = SOURCE_TYPE_URL,
22 |                  seconds_timeout=10,
23 |                  concurrency=None,
24 |                  include_vss=False):
25 |         self.sitemap_sources = sitemap_sources
26 |         self.source_type = source_type
27 |         self.semaphore = asyncio.Semaphore(concurrency) if concurrency is not None else None
28 |         self.timeout = aiohttp.ClientTimeout(
29 |             sock_connect=seconds_timeout,
30 |             sock_read=seconds_timeout
31 |         )
32 |         self.include_vss = include_vss
33 | 
34 |     def build(self):
35 |         urls = self.get_urls()
36 |         loop = asyncio.get_event_loop()
37 |         contents = loop.run_until_complete(self.get_contents(urls))
38 |         cm = ContentMapCreator(contents, include_vss=self.include_vss)
39 |         cm.build()
40 | 
41 |     def get_urls(self):
42 |         all_urls = []
43 |         if self.source_type == self.SOURCE_TYPE_URL:
44 |             for sitemap_url in self.sitemap_sources:
45 |                 urls = self._get_urls_from_url(sitemap_url)
46 |                 all_urls.extend(urls)
47 |         elif self.source_type == self.SOURCE_TYPE_DISK:
48 |             for directory in self.sitemap_sources:
49 |                 for filename in os.listdir(directory):
50 |                     if filename.endswith('.xml'):
51 |                         filepath = os.path.join(directory, filename)
52 |                         urls = self._get_urls_from_disk(filepath)
53 |                         all_urls.extend(urls)
54 |         return all_urls
55 | 
56 |     def _get_urls_from_url(self, sitemap_url):
57 |         r = requests.get(sitemap_url)
58 |         tree = etree.fromstring(r.content)
59 |         return self._extract_urls_from_tree(tree)
60 | 
61 |     def _get_urls_from_disk(self, filepath):
62 |         tree = etree.parse(filepath)
63 |         return self._extract_urls_from_tree(tree)
64 | 
65 |     def _extract_urls_from_tree(self, tree):
66 |         return [
67 |             url.text for url
68 |             in tree.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
69 |         ]
70 | 
71 |     async def get_contents(self, urls):
72 |         async with aiohttp.ClientSession(timeout=self.timeout) as session:
73 |             tasks = [self.fetch_content(session, url) for url in urls]
74 |             return await tqdm_asyncio.gather(*tasks)
75 | 
76 |     async def fetch_content(self, session, url):
77 |         try:
78 |             if not self.semaphore:
79 |                 async with session.get(url) as response:
80 |                     raw = await response.text()
81 |             else:
82 |                 async with self.semaphore, session.get(url) as response:
83 |                     raw = await response.text()
84 |             content = trafilatura.extract(raw)
85 |             return {"url": url, "content": content}
86 | 
87 |         except aiohttp.ClientConnectionError as e:
88 |             logging.error(f"Error while fetching {url}: {e.__repr__}")
89 |             return None
90 | 


--------------------------------------------------------------------------------
/contentmap/vss.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Class ContentMapVSS to create vector search dataset from a contentmap
 3 | dataset already created.
 4 | """
 5 | import sqlite3
 6 | from typing import Optional
 7 | 
 8 | from langchain.text_splitter import CharacterTextSplitter
 9 | from langchain_community.embeddings.sentence_transformer import (
10 |     SentenceTransformerEmbeddings,
11 | )
12 | from langchain_community.vectorstores import SQLiteVSS
13 | 
14 | 
15 | class ContentMapVSS:
16 | 
17 |     def __init__(self,
18 |                  connection: Optional[sqlite3.Connection] = None,
19 |                  db_file: str = "contentmap.db"
20 |                  ):
21 | 
22 |         self.connection = connection
23 |         if not connection:
24 |             self.connection = SQLiteVSS.create_connection(db_file)
25 | 
26 |         embedding_function = SentenceTransformerEmbeddings(
27 |             model_name="all-MiniLM-L6-v2"
28 |         )
29 |         self.vss = SQLiteVSS(
30 |             table="content_chunks",
31 |             embedding=embedding_function,
32 |             connection=self.connection
33 |         )
34 | 
35 |     def load(self):
36 |         # content table must be there
37 |         assert self.table_exists(table_name="content")
38 |         texts, metadatas = self.prepare_texts_and_metadatas()
39 |         self.vss.add_texts(texts=texts, metadatas=metadatas)
40 |         return self.vss
41 | 
42 |     def table_exists(self, table_name: str) -> bool:
43 |         res = self.connection.execute(f"""
44 |             SELECT name 
45 |             FROM sqlite_master 
46 |             WHERE type='table' AND name='{table_name}';
47 |         """)
48 |         rows = res.fetchall()
49 |         if len(rows) == 1:
50 |             return True
51 |         return False
52 | 
53 |     def prepare_texts_and_metadatas(self):
54 |         cursor = self.connection.cursor()
55 |         result = cursor.execute("SELECT content, url FROM content")
56 |         rows = result.fetchall()
57 | 
58 |         # based on Anyscale analysis (https://t.ly/yjgxQ), it looks like the
59 |         # sweet spot is 700 chunk size and 50 chunk overlap.
60 |         text_splitter = CharacterTextSplitter(
61 |             chunk_size=700, chunk_overlap=50, separator="."
62 |         )
63 | 
64 |         texts = []
65 |         metadatas = []
66 |         for row in rows:
67 |             chunks = text_splitter.split_text(row["content"])
68 |             chunk_metadatas = [{"url": row["url"]} for _ in chunks]
69 |             texts += chunks
70 |             metadatas += chunk_metadatas
71 | 
72 |         return texts, metadatas
73 | 
74 |     def similarity_search(self, *args, **kwargs):
75 |         data = self.vss.similarity_search(*args, **kwargs)
76 |         rag_results = []
77 |         for doc in data:
78 |             item = {"content": doc.page_content, "url": doc.metadata['url']}
79 |             rag_results.append(item)
80 |         return rag_results
81 | 


--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | services:
3 |   app:
4 |     build:
5 |       context: .
6 |     image: contentmap_local


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "contentmap"
 3 | version = "0.5.0"
 4 | description = ""
 5 | authors = ["Philippe Oger <phil.oger@gmail.com>"]
 6 | readme = "README.md"
 7 | packages = [{include = "contentmap"}]
 8 | 
 9 | 
10 | [tool.poetry.dependencies]
11 | python = "^3.9"
12 | requests = "^2.31.0"
13 | tqdm = "^4.66.1"
14 | lxml = "4.9.4"
15 | trafilatura = "^1.6.4"
16 | aiohttp = "^3.9.1"
17 | sqlite-vss = "^0.1.2"
18 | langchain = "^0.1.8"
19 | sentence-transformers = "^2.3.1"
20 | 
21 | 
22 | [tool.poetry.group.test.dependencies]
23 | pytest = "^7.4.4"
24 | 
25 | [build-system]
26 | requires = ["poetry-core"]
27 | build-backend = "poetry.core.masonry.api"
28 | 
29 | 
30 | [tool.semantic_release]
31 | version_toml = [
32 |     "pyproject.toml:tool.poetry.version"
33 | ]
34 | 
35 | branch = "main"
36 | upload_to_PyPI = true
37 | upload_to_release = true
38 | build_command = "pip install poetry && poetry build"
39 | commit_author = "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
40 | 
41 | 
42 | [tool.semantic_release.commit_parser_options]
43 | allowed_tags = [
44 |     "build",
45 |     "chore",
46 |     "ci",
47 |     "docs",
48 |     "feat",
49 |     "fix",
50 |     "perf",
51 |     "style",
52 |     "refactor",
53 |     "test"
54 | ]
55 | minor_tags = ["feat"]
56 | patch_tags = ["fix", "perf"]


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philippe2803/contentmap/c9e3b4371b2809e5ae50ddbd93d1c4e89ea40fb5/tests/__init__.py


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import os
 3 | import os.path as op
 4 | import logging
 5 | 
 6 | 
 7 | @pytest.fixture(autouse=True)
 8 | def remove_created_database_after_test():
 9 |     """Fixture to execute asserts before and after a test is run"""
10 |     # Setup logic
11 |     yield   # this is where the testing happens
12 |     # Teardown logic
13 | 
14 |     contentmap_db_path = op.join(op.dirname(__file__), "contentmap.db")
15 |     if op.exists(contentmap_db_path):
16 |         logging.info('Destroying mock sqlite content instance')
17 |         os.remove(contentmap_db_path)
18 | 
19 | 


--------------------------------------------------------------------------------
/tests/fixture.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philippe2803/contentmap/c9e3b4371b2809e5ae50ddbd93d1c4e89ea40fb5/tests/fixture.db


--------------------------------------------------------------------------------
/tests/fixtures/sitemap_folder_a/sitemap_a.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
3 |   <url>
4 |     <loc>https://www.example.com/docs/en/example/?topic=testing</loc>
5 |   </url>
6 |   <url>
7 |     <loc>https://www.example.com/docs/en/example/?topic=contact-us</loc>
8 |   </url>
9 | </urlset>


--------------------------------------------------------------------------------
/tests/fixtures/sitemap_folder_b/sitemap_b.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
3 |   <url>
4 |     <loc>https://www.example.com/docs/en/example/?topic=library-overview</loc>
5 |   </url>
6 |   <url>
7 |     <loc>https://www.example.com/docs/en/example/?topic=about-this-content</loc>
8 |   </url>
9 | </urlset>


--------------------------------------------------------------------------------
/tests/test_creator.py:
--------------------------------------------------------------------------------
 1 | from contentmap.core import ContentMapCreator
 2 | import sqlite3
 3 | 
 4 | 
 5 | data = [
 6 |     {"url": "https://www.google.com", "content": "this is google home page"},
 7 |     {"url": "https://www.google.com/about", "content": "this is google about page"},
 8 | ]
 9 | 
10 | 
11 | def test_generator():
12 |     database = ContentMapCreator(data)
13 |     assert isinstance(database.connection, sqlite3.Connection)
14 |     assert isinstance(database.cursor, sqlite3.Cursor)
15 | 
16 | 
17 | def test_schema():
18 |     database = ContentMapCreator(data)
19 |     database.build()
20 |     query = database.cursor.execute("SELECT count(1) FROM content")
21 |     assert query.fetchone()[0] == 2
22 | 
23 | 
24 | def test_content_creator_vss():
25 |     database = ContentMapCreator(data, include_vss=True)
26 |     database.build()
27 |     query = database.cursor.execute("SELECT count(1) FROM content_chunks")
28 |     assert query.fetchone()[0] == 4
29 | 
30 | 
31 | def test_content_creator_vss_check_chunks():
32 |     database = ContentMapCreator(data, include_vss=True)
33 |     database.build()
34 |     query = "SELECT distinct(tbl_name) FROM sqlite_master"
35 |     result = database.cursor.execute(query)
36 |     found_tables = []
37 |     for row in result:
38 |         found_tables.append(row["tbl_name"])
39 |     assert "content" in found_tables
40 |     assert "content_chunks" in found_tables
41 | 


--------------------------------------------------------------------------------
/tests/test_sitemap.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | import pytest
 4 | 
 5 | from unittest.mock import patch, MagicMock
 6 | from contentmap.sitemap import SitemapToContentDatabase
 7 | 
 8 | 
 9 | class TestSitemapToContentDatabase(unittest.TestCase):
10 |     def create_mock_response(self, content):
11 |         mock_response = MagicMock()
12 |         mock_response.content = content
13 |         return mock_response
14 | 
15 |     def generate_sample_sitemap_xml(self, url):
16 |         return f'''
17 |             <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
18 |                 <url>
19 |                     <loc>{url}</loc>
20 |                 </url>
21 |             </urlset>'''
22 |     @patch('contentmap.sitemap.requests.get')
23 |     def test_get_urls_given_one_sitemap_url(self, mock_get):
24 |         mock_get.return_value = self.create_mock_response(self.generate_sample_sitemap_xml('https://www.example.com/docs/en/example/?topic=testing'))
25 | 
26 |         sitemap_db = SitemapToContentDatabase(sitemap_sources=['https://example.com/sitemap.xml'], source_type='url')
27 |         urls = sitemap_db.get_urls()
28 | 
29 |         self.assertEqual(urls, ['https://www.example.com/docs/en/example/?topic=testing'])
30 |         mock_get.assert_called_once_with('https://example.com/sitemap.xml')
31 | 
32 | 
33 |     @patch('contentmap.sitemap.requests.get')
34 |     def test_get_urls_given_multiple_sitemap_urls(self, mock_get):
35 |         mock_get.side_effect = [
36 |             self.create_mock_response(self.generate_sample_sitemap_xml('https://www.example.com/docs/en/example/?topic=testing')),
37 |             self.create_mock_response(self.generate_sample_sitemap_xml('https://www.anotherexample.com/docs/en/example/?topic=contact-us'))
38 |         ]
39 | 
40 |         sitemap_db = SitemapToContentDatabase(sitemap_sources=['https://example.com/sitemap.xml', 'https://anotherexample.com/sitemap.xml'], source_type='url')
41 |         urls = sitemap_db.get_urls()
42 | 
43 |         self.assertEqual(urls, [
44 |             'https://www.example.com/docs/en/example/?topic=testing',
45 |             'https://www.anotherexample.com/docs/en/example/?topic=contact-us'
46 |         ])
47 |         mock_get.assert_any_call('https://example.com/sitemap.xml')
48 |         mock_get.assert_any_call('https://anotherexample.com/sitemap.xml')
49 |         self.assertEqual(mock_get.call_count, 2)
50 | 
51 |     def test_get_urls_given_one_location_on_disk(self):
52 |         sitemap_folder_a_path = os.path.join(os.path.dirname(__file__), 'fixtures', 'sitemap_folder_a')
53 |         sitemap_db = SitemapToContentDatabase(sitemap_sources=[sitemap_folder_a_path], source_type='disk')
54 |         urls = sitemap_db.get_urls()
55 | 
56 |         self.assertEqual(urls, ['https://www.example.com/docs/en/example/?topic=testing',
57 |                                 'https://www.example.com/docs/en/example/?topic=contact-us'
58 |                                 ])
59 | 
60 | 
61 |     def test_get_urls_given_multiple_locations_on_disk(self):
62 |         sitemap_folder_a_path = os.path.join(os.path.dirname(__file__), 'fixtures', 'sitemap_folder_a')
63 |         sitemap_folder_b_path = os.path.join(os.path.dirname(__file__), 'fixtures', 'sitemap_folder_b')
64 |         sitemap_db = SitemapToContentDatabase(sitemap_sources=[sitemap_folder_a_path, sitemap_folder_b_path], source_type='disk')
65 |         urls = sitemap_db.get_urls()
66 | 
67 |         self.assertEqual(urls, ['https://www.example.com/docs/en/example/?topic=testing',
68 |                                 'https://www.example.com/docs/en/example/?topic=contact-us',
69 |                                 'https://www.example.com/docs/en/example/?topic=library-overview',
70 |                                 'https://www.example.com/docs/en/example/?topic=about-this-content'
71 |                                 ])


--------------------------------------------------------------------------------
/tests/test_vss.py:
--------------------------------------------------------------------------------
 1 | from contentmap.vss import ContentMapVSS
 2 | import os.path as op
 3 | from tests.utils import build_fixture_db
 4 | 
 5 | 
 6 | class TestContentMapVSS:
 7 | 
 8 |     def test_assertion_content_exists(self):
 9 |         fixture_db = op.join(op.dirname(__file__), "fixture.db")
10 |         vss_content = ContentMapVSS(db_file=fixture_db)
11 |         assert vss_content.table_exists(table_name="content") is True
12 | 
13 |     def test_assertion_content_not_exists(self):
14 |         vss_content = ContentMapVSS(db_file=":memory:")
15 |         assert vss_content.table_exists(table_name="content") is False
16 | 
17 | 
18 | class TestVssTablesCreation:
19 | 
20 |     def test_vss_instance(self):
21 |         db = build_fixture_db()
22 |         cm_vss = ContentMapVSS(db_file=db)
23 |         cm_vss.load()
24 |         assert cm_vss.table_exists("content_chunks")
25 | 
26 |     def test_prepare_texts_and_metadatas(self):
27 |         db = build_fixture_db()
28 |         cm_vss = ContentMapVSS(db_file=db)
29 |         texts, metadatas = cm_vss.prepare_texts_and_metadatas()
30 |         assert len(texts) == len(metadatas) >= 1
31 | 
32 |     def test_chunk_table(self):
33 |         db = build_fixture_db()
34 |         cm_vss = ContentMapVSS(db_file=db)
35 |         cm_vss.load()
36 |         assert cm_vss.table_exists("content_chunks")
37 |         cursor = cm_vss.connection.cursor()
38 |         res = cursor.execute("SELECT * FROM content_chunks")
39 |         rows = res.fetchall()
40 |         assert len(rows) >= 15
41 | 
42 |     def test_similarity_search(self):
43 |         db = build_fixture_db()
44 |         cm_vss = ContentMapVSS(db_file=db)
45 |         cm_vss.load()
46 |         data = cm_vss.similarity_search(query="who is Mistral ai company?", k=2)
47 |         assert len(data) == 2
48 |         urls = [doc["url"] for doc in data]
49 |         for url in urls:
50 |             assert url == "https://philippeoger.com/pages/ai-scene-in-europe-last-week/"
51 | 


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
 1 | import os.path as op
 2 | import shutil
 3 | 
 4 | 
 5 | def build_fixture_db():
 6 |     fixture_db = op.join(op.dirname(__file__), 'fixture.db')
 7 |     dest = op.join(op.dirname(__file__), 'contentmap.db')
 8 |     shutil.copy(fixture_db, dest)
 9 |     return dest
10 | 


--------------------------------------------------------------------------------