├── app ├── __init__.py ├── domain_status.json ├── .gitattributes ├── .dockerignore ├── scraper │ ├── spiders │ │ ├── __init__.py │ │ └── spider.py │ ├── items.py │ ├── pipelines.py │ ├── __init__.py │ ├── middlewares.py │ └── settings.py ├── scrapy.cfg ├── dockerfile ├── .env.example ├── package.json ├── pyproject.toml ├── webpack.config.js ├── gcp_deploy.sh ├── .gitignore ├── main.py ├── static │ └── chat-bubble.js ├── models.py └── interface │ └── chat-bubble.js ├── .gitignore ├── CONTRIBUTING.md ├── .env.example ├── demo.html ├── docker-compose.yml ├── README.md └── LICENSE /app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/domain_status.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv 3 | .mypy_cache/ 4 | __pycache__/ 5 | .DS_Store 6 | gcp_vm.md -------------------------------------------------------------------------------- /app/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /app/.dockerignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv 3 | interface/ 4 | webpack.config.js 5 | __pycache__/ 6 | .mypy_cache/ 7 | notebooks/ 8 | data/ -------------------------------------------------------------------------------- /app/scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /app/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scraper 12 | -------------------------------------------------------------------------------- /app/scraper/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class Scraper1Item(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing Guidelines 2 | 3 | Thank you for contributing! 🎉 Please follow these steps: 4 | 5 | - Fork the repository and create a branch (feature/your-feature). 6 | - Write clean, well-documented code. 7 | - Submit a Pull Request (PR) with a clear description and link to the issue. 
8 | - Respond to feedback and make changes as needed. 9 | -------------------------------------------------------------------------------- /app/dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM python:3.11 3 | 4 | WORKDIR /app 5 | 6 | RUN pip install poetry 7 | COPY ./pyproject.toml /app/pyproject.toml 8 | COPY ./poetry.lock /app/poetry.lock 9 | RUN poetry config virtualenvs.create false 10 | RUN poetry install --no-interaction --no-ansi --no-root 11 | 12 | COPY . /app 13 | 14 | EXPOSE 8080 15 | CMD ["python", "-m", "main"] -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | URL=https://www.example.com # Your assistant will know everything about this URL 2 | 3 | # To add: 4 | MISTRAL_API_KEY=... 5 | PHOSPHO_API_KEY=... 6 | PHOSPHO_PROJECT_ID=... 7 | 8 | # Advanced config (Optional ) 9 | ORIGINS='["*"]' # Used for CORS policy. Note: this string is evaluated to an array. 10 | SERVER_URL=http://localhost:8080 # The URL of the server -------------------------------------------------------------------------------- /app/.env.example: -------------------------------------------------------------------------------- 1 | URL=https://www.example.com # Your assistant will know everything about this URL 2 | 3 | # To add: 4 | MISTRAL_API_KEY=... 5 | PHOSPHO_API_KEY=... 6 | PHOSPHO_PROJECT_ID=... 7 | 8 | # Advanced config (Optional ) 9 | ORIGINS='["*"]' # Used for CORS policy. Note: this string is evaluated to an array. 10 | SERVER_URL=http://localhost:8080 # The URL of the server -------------------------------------------------------------------------------- /app/scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class Scraper1Pipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /app/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "@babel/preset-react": "^7.25.9", 4 | "babel-loader": "^9.2.1", 5 | "dotenv": "^16.4.5", 6 | "lucide-react": "^0.454.0", 7 | "react": "^18.3.1", 8 | "react-dom": "^18.3.1", 9 | "webpack": "^5.96.1", 10 | "webpack-cli": "^5.1.4" 11 | }, 12 | "devDependencies": { 13 | "@babel/cli": "^7.25.9", 14 | "@babel/core": "^7.26.0", 15 | "@babel/preset-env": "^7.26.0" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 |Look, you can now chat with an AI assistant here.
19 | 20 | 21 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | qdrant: 5 | image: qdrant/qdrant:latest 6 | container_name: qdrant 7 | ports: 8 | - "6333:6333" 9 | volumes: 10 | - qdrant_data:/qdrant/storage 11 | 12 | app: 13 | build: 14 | context: ./app 15 | dockerfile: Dockerfile 16 | container_name: python_app 17 | ports: 18 | - "8080:8080" 19 | depends_on: 20 | - qdrant 21 | environment: 22 | PYTHONPATH: /app 23 | QDRANT_HOST: qdrant 24 | QDRANT_PORT: 6333 25 | QDRANT_API_KEY: ${QDRANT_API_KEY} 26 | QDRANT_LOCATION: ${QDRANT_LOCATION} 27 | URL: ${URL} 28 | MISTRAL_API_KEY: ${MISTRAL_API_KEY} 29 | PHOSPHO_API_KEY: ${PHOSPHO_API_KEY} 30 | PHOSPHO_PROJECT_ID: ${PHOSPHO_PROJECT_ID} 31 | 32 | volumes: 33 | qdrant_data: 34 | -------------------------------------------------------------------------------- /app/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "chat-extension" 3 | version = "0.1.0" 4 | description = "Interatct with any website through a chatbot" 5 | authors = ["frederic.legrand", "wandrille.flamant"] 6 | readme = "README.md" 7 | packages = [{ include = "app" }] 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.11,<3.13" 11 | fastapi = "^0.115.0" 12 | uvicorn = "^0.31.0" 13 | python-dotenv = "^1.0.1" 14 | scrapy = "^2.11.2" 15 | qdrant-client = "^1.11.3" 16 | mistralai = "^1.1.0" 17 | beautifulsoup4 = "^4.12.3" 18 | pandas = "^2.2.3" 19 | loguru = "^0.7.2" 20 | phospho = "^0.3.44" 21 | llama-index = "^0.12.25" 22 | llama-index-vector-stores-qdrant = "^0.4.0" 23 | llama-index-embeddings-mistralai = "^0.3.0" 24 | fastapi-simple-rate-limiter = "^0.0.4" 25 | 26 | [tool.poetry.group.dev.dependencies] 27 | mypy = "^1.11.2" 28 | 29 | [build-system] 30 | requires = ["poetry-core"] 31 | build-backend = "poetry.core.masonry.api" 32 | -------------------------------------------------------------------------------- /app/webpack.config.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const dotenv = require('dotenv'); 3 | const webpack = require('webpack'); 4 | 5 | dotenv.config(); 6 | 7 | module.exports = { 8 | entry: './interface/chat-bubble.js', // path to your ChatBubble script 9 | output: { 10 | path: path.resolve(__dirname, 'component'), 11 | filename: 'chat-bubble.js', // output bundled file 12 | }, 13 | module: { 14 | rules: [ 15 | { 16 | test: /\.js$/, 17 | exclude: /node_modules/, 18 | use: { 19 | loader: 'babel-loader', 20 | options: { 21 | presets: ['@babel/preset-react'], 22 | }, 23 | }, 24 | }, 25 | ], 26 | }, 27 | plugins: [ 28 | new webpack.DefinePlugin({ 29 | 'process.env.SERVER_URL': JSON.stringify(process.env.SERVER_URL) || JSON.stringify('http://localhost:8080'), 30 | }), 31 | ], 32 | mode: 'production', 33 | }; 34 | -------------------------------------------------------------------------------- /app/gcp_deploy.sh: -------------------------------------------------------------------------------- 1 | 2 | # This file use GCP cloud build to build the image and deploy it to GCP Cloud Run 3 | # For it to work with your GCP project, replace the project id, region, and other variables with your own 4 | 5 | # You will need 6 | # - Qdrant cloud to host your vectors: https://qdrant.tech 7 | # - gcloud CLI installed and authenticated with your GCP account: 
https://cloud.google.com/sdk/docs/install 8 | # - environment variables in app/.env file (look at app/.env.example for reference) 9 | 10 | # EXAMPLE USAGE: 11 | # gcloud init 12 | # sudo bash app/gcloud_deploy.sh 13 | 14 | echo "Deploying ai-chat-bubble to GCP" 15 | 16 | # GCP builds the image and pushes it to the container registry 17 | gcloud builds submit --region=europe-west1 --tag europe-west1-docker.pkg.dev/portal-385519/phospho-backend/ai-chat-bubble:latest 18 | 19 | # Read the .env file and export the variables 20 | set -a && source .env && set +a 21 | 22 | # Deploy the image to GCP Cloud Run 23 | gcloud run deploy ai-chat-bubble \ 24 | --image=europe-west1-docker.pkg.dev/portal-385519/phospho-backend/ai-chat-bubble:latest \ 25 | --region=europe-west1 \ 26 | --allow-unauthenticated \ 27 | --set-env-vars URL=$URL,PHOSPHO_API_KEY=$PHOSPHO_API_KEY,PHOSPHO_PROJECT_ID=$PHOSPHO_PROJECT_ID \ 28 | --set-env-vars QDRANT_API_KEY=$QDRANT_API_KEY,QDRANT_LOCATION=$QDRANT_LOCATION,ORIGIN=$ORIGIN,MISTRAL_API_KEY=$MISTRAL_API_KEY \ 29 | --memory=1Gi -------------------------------------------------------------------------------- /app/scraper/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from scrapy.crawler import CrawlerProcess # type: ignore 4 | from scrapy.utils.project import get_project_settings # type: ignore 5 | from scraper.spiders.spider import TextContentSpider # type: ignore 6 | 7 | 8 | class ScraperInterface: 9 | """ 10 | scraper logic: 11 | - scrapy project url LinkExtractor (basically a url follower, it will find all the urls in a page and then follow them) 12 | - export all the content to a json exporter, it will export the scraped data to a json file) 13 | - for the json format, check @json_format.py 14 | """ 15 | 16 | def __init__(self, domain, depth): 17 | """ 18 | Initialize the ScraperInterface with domain and depth. 19 | 20 | :param domain: The domain to scrape. 21 | :param depth: The depth of the crawl. 22 | """ 23 | self.domain = domain 24 | self.depth = depth 25 | self.output_path = f"data/{self.domain}.json" 26 | self.spider_db = os.path.join(os.getcwd(), "data") 27 | 28 | def run_crawler(self): 29 | """ 30 | Run the Scrapy crawler to scrape the website. 31 | """ 32 | start_time = time.time() 33 | process = CrawlerProcess(get_project_settings()) 34 | process.crawl( 35 | TextContentSpider, 36 | domain=self.domain, 37 | depth=self.depth, 38 | output_path=self.output_path, 39 | db_path=self.spider_db, 40 | ) 41 | process.start() # Start the reactor and perform all crawls 42 | end_time = time.time() 43 | print(f"Time taken: {end_time - start_time} seconds") 44 | -------------------------------------------------------------------------------- /app/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Project Specific 4 | data/* 5 | .DS_Store 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/#use-with-ide 116 | .pdm.toml 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
166 | #.idea/ 167 | 168 | node_modules/ 169 | 170 | .env 171 | notebooks/ -------------------------------------------------------------------------------- /app/scraper/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class Scraper1SpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info("Spider opened: %s" % spider.name) 57 | 58 | 59 | class Scraper1DownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 |
86 |         # Must either:
87 |         # - return a Response object
88 |         # - return a Request object
89 |         # - or raise IgnoreRequest
90 |         return response
91 |
92 |     def process_exception(self, request, exception, spider):
93 |         # Called when a download handler or a process_request()
94 |         # (from other downloader middleware) raises an exception.
95 |
96 |         # Must either:
97 |         # - return None: continue processing this exception
98 |         # - return a Response object: stops process_exception() chain
99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 |
102 |     def spider_opened(self, spider):
103 |         spider.logger.info("Spider opened: %s" % spider.name)
104 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AI chat bubble - custom AI assistant connected to your knowledge
2 |
3 | **Simple and fast AI chat bubble for your HTML website.** The AI assistant can answer questions about a website's content using RAG, streaming, and the Mistral model. Compatible with **React** and **WordPress**!
4 |
5 | **How does it work?**
6 |
7 | 1. Run the backend to create an assistant with knowledge about your website's content
8 | 2. Add a code snippet to your HTML frontend
9 | 3. Your users can now chat with an assistant in an AI chat bubble!
10 |
11 | **Production-ready**
12 |
13 | You can host the AI chat bubble on your own machine with a simple `docker-compose up --build`.
14 | See what users are asking thanks to [phospho analytics](https://phospho.ai), which is already integrated.
15 |
16 |
17 |
18 | ## Quickstart
19 |
20 | ### 1. Set up .env
21 |
22 | Clone this repository.
23 |
24 | ```bash
25 | # Clone using the web URL
26 | git clone https://github.com/phospho-app/ai-chat-bubble.git
27 | ```
28 |
29 | Then, create a `.env` file at the root with this content:
30 |
31 | ```bash
32 | URL=https://www.example.com # Your assistant will know everything about this URL
33 |
34 | # To add:
35 | MISTRAL_API_KEY=...
36 | PHOSPHO_API_KEY=...
37 | PHOSPHO_PROJECT_ID=...
38 | ```
39 |
40 | In `URL`, put the website with the relevant content you want the AI assistant to know about.
41 | The assistant will crawl domains with a depth of 3 (this is customizable).
42 |
43 | #### External services
44 |
45 | - **LLM:** We use the Mistral AI model - _mistral-large-latest_. Get your `MISTRAL_API_KEY` [here](https://mistral.ai).
46 | - **Analytics:** Messages are logged to phospho. Get your `PHOSPHO_API_KEY` and your `PHOSPHO_PROJECT_ID` [here](https://platform.phospho.ai).
47 |
48 | ### 2. Run the assistant backend
49 |
50 | To deploy the backend of the AI chat bubble, this repository uses [docker compose](https://docs.docker.com/compose/). [Follow this guide to install docker compose](https://docs.docker.com/compose/install/), then run the assistant's backend:
51 |
52 | ```bash
53 | cd ai-chat-bubble # the name of the cloned repo
54 | docker-compose up --build
55 | ```
56 |
57 | Questions are sent to the assistant using the POST API endpoint `/question_on_url`. This returns a streamable response. Go to [localhost:8080/docs](http://localhost:8080/docs) for more details.
58 |
59 | ### 3. Add the chat bubble to your website
60 |
61 | Add the chat bubble to your website with this snippet in an HTML component:
62 |
63 | ```html
64 | <script src="http://localhost:8080/static/chat-bubble.js"></script>
65 | ```
66 |
67 | If you just want to test your assistant, you simply need to open the `demo.html` file in your browser.
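You can also test the backend without the widget by calling the endpoint directly. Below is a minimal sketch that sends a question to a locally running backend and prints the streamed answer as it arrives; it assumes the backend is on the default `SERVER_URL` (`http://localhost:8080`) and that the `requests` package is installed (it is not a project dependency).

```python
# Minimal sketch: query the assistant backend directly.
# Assumes the backend is running locally on the default port and that
# `requests` is installed (`pip install requests`).
import requests

SERVER_URL = "http://localhost:8080"

response = requests.post(
    f"{SERVER_URL}/question_on_url",
    json={"question": "What is this website about?"},  # the endpoint expects a `question` field
    stream=True,  # the backend returns a streaming text/plain response
    timeout=60,
)
response.raise_for_status()

# Print the answer as it is streamed back
for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)
print()
```

Note that this endpoint is rate limited (2 requests per 5 seconds), so calls in quick succession may be rejected.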
68 |
69 | See the advanced configuration section below to change its style.
70 |
71 | ## Advanced configuration
72 |
73 | ### Change the chat bubble UI
74 |
75 | The file `component/chat-bubble.js` contains the AI chat bubble style. It is served as a static file and is the compiled version of `interface/chat-bubble.js`.
76 |
77 | To change the AI chat bubble, edit `interface/chat-bubble.js` and then run `npx webpack` in the _app_ folder of the repo.
78 |
79 | ### CORS policy
80 |
81 | In production, it's best to set up a restrictive CORS policy so that only your frontend can call your AI assistant backend. To do this, add an `ORIGINS` list in your `.env`.
82 |
83 | ```
84 | ORIGINS = ["http://localhost:3000", "http://localhost:3001"]
85 | ```
86 |
87 | _Only URLs in `ORIGINS` can access the `/question_on_url` endpoint._
88 |
89 | ### Edit ports
90 |
91 | The Docker container runs the main app on port _8080_. To change it, set `SERVER_URL` in your `.env` to a URL with the new port.
92 |
93 | ```
94 | SERVER_URL=http://localhost:your_new_port
95 | ```
96 |
97 | Then change the source of the interface script accordingly (the `src` attribute of the `<script>` tag).
98 |
99 | ### Prompts, AI, vector databases
100 |
101 | The AI assistant of the AI chat bubble uses [Llama Index](https://docs.llamaindex.ai/en/stable/), [Qdrant](https://qdrant.tech/documentation/), and [Mistral](https://docs.mistral.ai). This behaviour is implemented in `models.py`.
102 |
103 | - Edit `ChatMistral` to change the prompts or models
104 | - Edit the `EmbeddingsVS` client to use a vector store other than Qdrant
105 |
106 | ## About
107 |
108 | Made by juniors for juniors in PARIS - phospho team 🥖🇫🇷
109 |
110 | Special thanks to @flamschou, @fred3105, and @oulianov 🧪💚
111 |
--------------------------------------------------------------------------------
/app/scraper/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for scraper project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used.
You can find more settings consulting the documentation: 5 | # 6 | # https://s.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = "scraper" 11 | 12 | SPIDER_MODULES = ["scraper.spiders"] 13 | NEWSPIDER_MODULE = "scraper.spiders" 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | # USER_AGENT = "scraper (+http://www.yourdomain.com)" 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = True 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | # CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | # DOWNLOAD_DELAY = 3 29 | # The download delay setting will honor only one of: 30 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | # CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | # COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | # TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | # DEFAULT_REQUEST_HEADERS = { 41 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 42 | # "Accept-Language": "en", 43 | # } 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | # SPIDER_MIDDLEWARES = { 48 | # "scraper.middlewares.Scraper1SpiderMiddleware": 543, 49 | # } 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | # DOWNLOADER_MIDDLEWARES = { 54 | # "scraper.middlewares.Scraper1DownloaderMiddleware": 543, 55 | # } 56 | 57 | # Enable or disable extensions 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 59 | # EXTENSIONS = { 60 | # "scrapy.extensions.telnet.TelnetConsole": None, 61 | # } 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | # ITEM_PIPELINES = { 66 | # "scraper.pipelines.Scraper1Pipeline": 300, 67 | # } 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 71 | # AUTOTHROTTLE_ENABLED = True 72 | # The initial download delay 73 | # AUTOTHROTTLE_START_DELAY = 5 74 | # The maximum download delay to be set in case of high latencies 75 | # AUTOTHROTTLE_MAX_DELAY = 60 76 | # The average number of requests Scrapy should be sending in parallel to 77 | # each remote server 78 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 79 | # Enable showing throttling stats for every response received: 80 | # AUTOTHROTTLE_DEBUG = False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | # HTTPCACHE_ENABLED = True 85 | # HTTPCACHE_EXPIRATION_SECS = 0 86 | # HTTPCACHE_DIR = "httpcache" 87 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 88 | # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 89 | 90 | # Set settings whose default value is deprecated to a future-proof value 91 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 92 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 93 | 
FEED_EXPORT_ENCODING = "utf-8" 94 | 95 | 96 | # --- settings config for retry middleware --- 97 | # Enable or ensure the RetryMiddleware is enabled 98 | RETRY_ENABLED = True 99 | 100 | # Include 429 in the retryable status codes 101 | RETRY_HTTP_CODES = [429, 500, 502, 503, 504, 408, 400] 102 | 103 | # Set the maximum number of retry attempts 104 | RETRY_TIMES = 5 # This can be adjusted based on your needs 105 | 106 | # Set the delay between retries 107 | RETRY_PRIORITY_ADJUST = ( 108 | -1 109 | ) # Adjust priority of retries to be higher than other requests 110 | DOWNLOAD_DELAY = 1 # Delay between requests to the same domain 111 | 112 | CONCURRENT_REQUESTS = ( 113 | 32 # or higher, depending on your bandwidth and the server's capacity 114 | ) 115 | COOKIES_ENABLED = False 116 | REDIRECT_MAX_TIMES = 3 117 | -------------------------------------------------------------------------------- /app/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | from dotenv import load_dotenv 5 | from fastapi import FastAPI, HTTPException 6 | from fastapi.responses import FileResponse, StreamingResponse 7 | from fastapi_simple_rate_limiter import rate_limiter 8 | 9 | sys.path.append(os.path.abspath(os.path.dirname(__file__))) 10 | 11 | from models import QuestionOnUrlRequest 12 | from typing import Dict, Optional 13 | from urllib.parse import urlparse 14 | from models import MainExecute 15 | from contextlib import asynccontextmanager 16 | from fastapi.middleware.cors import CORSMiddleware 17 | from loguru import logger 18 | 19 | load_dotenv() 20 | 21 | # Check that the environment variables are set 22 | assert os.getenv("MISTRAL_API_KEY"), "MISTRAL_API_KEY environment variable not set" 23 | assert os.getenv("URL"), "URL environment variable not set" 24 | 25 | URL = os.getenv("URL") 26 | DOMAIN_STATUS_FILE = "domain_status.json" 27 | DATA_FOLDER = "data" 28 | ORIGINS = os.getenv("ORIGINS", ["*"]) 29 | # Parse the string into an array. 
Not needed if using load_dotenv 30 | if isinstance(ORIGINS, str): 31 | ORIGINS = eval(ORIGINS) 32 | SERVER_URL = os.getenv("SERVER_URL", "http://localhost:8080") 33 | 34 | host, port = urlparse(SERVER_URL).netloc.split(":") 35 | 36 | 37 | # Dictionary to store the status of each domain 38 | domain_status: Dict[str, str] = {} 39 | 40 | # Dictionary to store MainExecute instances for each domain 41 | domain_instances: Dict[str, MainExecute] = {} 42 | 43 | 44 | def load_domain_status(): 45 | if os.path.exists(DOMAIN_STATUS_FILE): 46 | with open(DOMAIN_STATUS_FILE, "r") as f: 47 | return json.load(f) 48 | else: 49 | return create_json_file() 50 | 51 | 52 | def create_json_file(): 53 | if os.path.exists(DATA_FOLDER): 54 | for filename in os.listdir(DATA_FOLDER): 55 | if filename.endswith(".json"): 56 | domain = filename[:-5] # Remove the .json extension 57 | domain_status[domain] = "completed" 58 | save_domain_status() 59 | return domain_status 60 | 61 | 62 | def save_domain_status(): 63 | with open(DOMAIN_STATUS_FILE, "w") as f: 64 | json.dump(domain_status, f) 65 | 66 | 67 | def initialize_domains(): 68 | global domain_status, domain_instances 69 | domain_status = load_domain_status() 70 | print(f"Loaded domain status: {domain_status}") 71 | for domain, status in domain_status.items(): 72 | print(f"Initializing domain: {domain}") 73 | if status == "completed": 74 | try: 75 | main_execute = MainExecute(domain, load=False) 76 | domain_instances[domain] = main_execute 77 | except Exception as e: 78 | logger.error(f"Failed to initialize domain {domain}: {str(e)}") 79 | domain_status[domain] = f"failed: {str(e)}" 80 | save_domain_status() 81 | 82 | 83 | def submit_url(url: Optional[str]): 84 | if url is None: 85 | raise HTTPException(status_code=400, detail="URL not set") 86 | domain = urlparse(url).netloc 87 | if domain not in domain_status or domain_status[domain] not in [ 88 | "queued", 89 | "processing", 90 | "completed", 91 | ]: 92 | domain_status[domain] = "queued" 93 | save_domain_status() 94 | logger.info(f"Submitting domain: {domain}") 95 | try: 96 | process_domain(domain) 97 | logger.info(f"{domain} indexation completed") 98 | except Exception as e: 99 | logger.error(f"Failed to process domain {domain}: {str(e)}") 100 | domain_status[domain] = f"failed: {str(e)}" 101 | save_domain_status() 102 | else: 103 | logger.info(f"Domain {domain} already being processed") 104 | 105 | 106 | @asynccontextmanager 107 | async def lifespan(app: FastAPI): 108 | # Startup: You can add initialization code here 109 | logger.info("Starting the application") 110 | 111 | initialize_domains() 112 | 113 | logger.info(f"domains: {domain_status.keys()}") 114 | 115 | logger.info(f"URL: {URL}") 116 | 117 | logger.info(f"Server URL: {SERVER_URL}") 118 | 119 | yield # Here the FastAPI application runs 120 | 121 | # Shutdown: You can add cleanup code here if needed 122 | print("Shutting down the application") 123 | 124 | 125 | app = FastAPI(lifespan=lifespan) 126 | 127 | # Add CORS middleware 128 | app.add_middleware( 129 | CORSMiddleware, 130 | allow_origins=ORIGINS 131 | if ORIGINS is not None 132 | else [], # Specifies the origins allowed to access this API 133 | allow_credentials=True, 134 | allow_methods=["*"], # Allows all methods (POST, GET, etc.) 
135 | allow_headers=["*"], # Allows all headers 136 | ) 137 | 138 | 139 | @rate_limiter(limit=3, seconds=60) 140 | @app.get("/") 141 | async def health_check(): 142 | return {"status": "ok"} 143 | 144 | 145 | @rate_limiter(limit=3, seconds=60) 146 | # Serve static files 147 | @app.get("/static/chat-bubble.js") 148 | async def serve_component_file(): 149 | file_path = os.path.join("static", "chat-bubble.js") 150 | if os.path.exists(file_path): 151 | return FileResponse(file_path) 152 | return {"error": "File not found"}, 404 153 | 154 | 155 | def process_domain(domain: str): 156 | domain_folder = os.path.join("data", domain) 157 | os.makedirs(domain_folder, exist_ok=True) 158 | logger.info(f"Indexing domain: {domain}") 159 | try: 160 | domain_status[domain] = "processing" 161 | save_domain_status() 162 | main_execute = MainExecute(domain) 163 | domain_instances[domain] = main_execute 164 | domain_status[domain] = "completed" 165 | save_domain_status() 166 | except Exception as e: 167 | domain_status[domain] = f"failed: {str(e)}" 168 | save_domain_status() 169 | 170 | 171 | @rate_limiter(limit=2, seconds=5) 172 | @app.post("/question_on_url") 173 | async def question_on_url(request: QuestionOnUrlRequest): 174 | if URL is None: 175 | raise HTTPException(status_code=400, detail="URL not set") 176 | url = URL 177 | logger.debug(f"Question on URL: {url}") 178 | question = request.question 179 | domain = urlparse(url).netloc 180 | 181 | logger.debug(f"Domain: {domain}") 182 | logger.debug(f"Domains: {domain_instances.keys()}") 183 | 184 | if domain not in domain_instances: 185 | raise HTTPException(status_code=400, detail="Domain not processed yet") 186 | 187 | if domain_status.get(domain) != "completed": 188 | raise HTTPException(status_code=400, detail="Domain processing not completed") 189 | 190 | logger.debug(f"Domains: {domain_instances.keys()}") 191 | main_execute = domain_instances[domain] 192 | 193 | return StreamingResponse(main_execute.ask(question), media_type="text/plain") 194 | 195 | 196 | if __name__ == "__main__": 197 | initialize_domains() 198 | submit_url(URL) 199 | 200 | import uvicorn 201 | 202 | uvicorn.run("main:app", host="0.0.0.0", port=int(port), reload=True) 203 | -------------------------------------------------------------------------------- /app/static/chat-bubble.js: -------------------------------------------------------------------------------- 1 | (()=>{const n=document.createElement("style");n.textContent="\n* {\n box-sizing: border-box;\n margin: 0;\n padding: 0;\n font-family: 'Inter', sans-serif; /* Using a modern sans-serif font like Inter */\n}\n\n/* Chat Bubble (Floating button) */\n.chat-bubble {\n position: fixed;\n bottom: 20px;\n right: 20px;\n width: 60px;\n height: 60px;\n background: #3e8ef7;\n border-radius: 50%;\n cursor: pointer;\n box-shadow: 0 2px 12px rgba(0, 0, 0, 0.1);\n display: flex;\n align-items: center;\n justify-content: center;\n transition: all 0.3s ease;\n z-index: 1000;\n}\n\n.chat-bubble:hover {\n transform: scale(1.1);\n}\n\n.chat-bubble-icon {\n font-size: 28px;\n}\n\n/* Chat Window Modal */\n.chat-window {\n position: fixed;\n bottom: 90px;\n right: 20px;\n width: 350px;\n height: 450px;\n background: #1e1e1e;\n border-radius: 16px;\n box-shadow: 0 12px 40px rgba(0, 0, 0, 0.1);\n display: none;\n flex-direction: column;\n z-index: 1000;\n opacity: 0;\n transform: translateY(30px);\n transition: opacity 0.3s ease, transform 0.3s ease;\n}\n\n.chat-window.active {\n display: flex;\n opacity: 1;\n transform: translateY(0);\n 
background: #1e1e1e;\n}\n\n/* Header */\n.chat-header {\n padding: 16px;\n background: #1e1e1e;\n color: white;\n font-size: 16px;\n font-weight: 600;\n display: flex;\n justify-content: space-between;\n align-items: center;\n border-top-left-radius: 16px;\n align-self: flex-end;\n border-top-right-radius: 16px;\n box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);\n}\n\n.close-button {\n background: none;\n border: none;\n color: white;\n cursor: pointer;\n font-size: 18px;\n transition: color 0.3s ease;\n}\n\n.close-button:hover {\n color: #3e8ef7;\n}\n\n/* Chat Messages Section */\n.chat-messages {\n flex-grow: 1;\n padding: 20px;\n overflow-y: auto;\n display: flex;\n flex-direction: column;\n gap: 14px;\n background: #1e1e1e;\n}\n\n.message {\n padding: 12px 18px;\n border-radius: 20px;\n max-width: 80%;\n word-wrap: break-word;\n font-size: 14px;\n line-height: 1.4;\n}\n\n.message.sent {\n background: #2A2A2A;\n color: white;\n align-self: flex-end;\n}\n\n.message.received {\n background: #3e8ef7;\n color: white;\n align-self: flex-start;\n}\n\n/* Chat Input Section */\n.chat-input {\n display: flex;\n padding: 16px;\n gap: 12px;\n background: #1e1e1e;\n}\n\n.message-input {\n flex-grow: 1;\n padding: 12px 18px;\n border: 1px solid #e0e0e0;\n border-radius: 30px;\n outline: none;\n font-size: 14px;\n transition: border-color 0.3s ease;\n}\n\n.message-input:focus {\n border-color: #3e8ef7;\n}\n\n.send-button {\n background: #3e8ef7;\n color: white;\n border: none;\n border-radius: 30px;\n padding: 0px 16px;\n cursor: pointer;\n transition: background 0.3s ease;\n font-size: 24px;\n align-items: center;\n}\n\n.send-button:hover {\n background: #3378d1;\n}\n\n.send-button:active {\n background: #2566a0;\n}\n\n\n.phospho-typing-indicator {\n padding: 10px;\n display: flex;\n align-items: center;\n}\n\n.typing-dots {\n display: flex;\n align-self: flex-start;\n}\n\n.typing-dots span {\n height: 8px;\n width: 8px;\n margin: 0 4px;\n background-color: #3e8ef7;\n display: block;\n border-radius: 50%;\n opacity: 0.4;\n animation: typing 1s infinite ease-in-out;\n align-self: flex-start;\n}\n\n.typing-dots span:nth-child(1) {\n animation-delay: 0.1s;\n}\n\n.typing-dots span:nth-child(2) {\n animation-delay: 0.2s;\n}\n\n.typing-dots span:nth-child(3) {\n animation-delay: 0.3s;\n}\n\n.highlighted-link {\n text-decoration: underline;\n color: rgb(255, 255, 255);\n}\n\n/* Copy button styling */\n.copy-btn {\n position: absolute;\n top: 5px;\n right: 5px;\n background-color: #444;\n color: #e6e6e6;\n border: none;\n border-radius: 3px;\n padding: 2px 5px;\n font-size: 12px;\n cursor: pointer;\n}\n\n.copy-btn:hover {\n background-color: #555;\n}\n\n\n/* Code block styling */\npre {\n background-color: #2b2b2b;\n border: 1px solid #444;\n border-radius: 4px;\n padding: 10px;\n overflow-x: auto;\n position: relative;\n}\n\ncode {\n font-family: 'Courier New', Courier, monospace;\n font-size: 14px;\n color: #2b2b2b;\n}\n\n/* Inline code styling */\np code {\n background-color: #2c2f33;\n color: #e6e6e6;\n padding: 2px 4px;\n border-radius: 3px;\n}\n\n/* Fade In Animation */\n.chat-window.fade-in {\n animation: fadeIn 0.3s forwards;\n}\n\n@media (max-width: 767px) {\n .chat-window {\n width: 100%;\n height: 100%;\n border-radius: 0;\n bottom: 0;\n right: 0;\n }\n\n .chat-messages {\n max-height: 70%;\n }\n\n .chat-input {\n flex-wrap: wrap;\n }\n\n .message-input {\n flex-grow: 1;\n width: 100%;\n }\n\n .send-button {\n margin-top: 12px;\n font-size: 40px;\n }\n}\n\n@keyframes typing {\n 0% {\n transform: 
translateY(0px);\n background-color: #0000ff;\n }\n 28% {\n transform: translateY(-7px);\n background-color: #3e8ef7;\n }\n 44% {\n transform: translateY(0px);\n background-color: #0000ff;\n }\n}\n\n@keyframes fadeIn {\n from {\n opacity: 0;\n transform: translateY(30px);\n }\n to {\n opacity: 1;\n transform: translateY(0);\n }\n}",document.head.appendChild(n);const e=document.createElement("div");e.className="chat-bubble",e.innerHTML='',document.body.appendChild(e);const t=document.createElement("div");t.className="chat-window",t.innerHTML='\n${o=t.trim(),o.replace(/&/g,"&").replace(//g,">").replace(/"/g,""").replace(/'/g,"'")}`;var o}))).replace(/`([^`]+)`/g,"$1")).replace(/\$\$([\s\S]*?)\$\$/g,((n,e)=>`"+n+"
"}))}(n);e.querySelector("p").innerHTML=t,e.querySelectorAll("pre").forEach((n=>{if(!n.querySelector(".copy-btn")){const e=document.createElement("button");e.textContent="Copy",e.className="copy-btn",e.addEventListener("click",(()=>{const t=n.querySelector("code").textContent;navigator.clipboard.writeText(t).then((()=>{e.textContent="Copied!",setTimeout((()=>{e.textContent="Copy"}),2e3)}))})),n.appendChild(e)}})),a.scrollTop=a.scrollHeight}(o),r())};await r()}else d(),console.error("Error: Response is not a StreamingResponse");else d(),console.error("Error with the request:",e.statusText)}catch(n){d(),console.error("Error:",n)}}}l("Hello! How can I assist you today?",!1),e.addEventListener("click",(function(){s=!s,t.classList.toggle("active",s),e.querySelector(".chat-bubble-icon").textContent=s?"⌑":"💬",s&&r.focus()})),o.addEventListener("click",c),i.addEventListener("click",p),r.addEventListener("keypress",(n=>{"Enter"===n.key&&p()})),document.addEventListener("click",(n=>{!s||t.contains(n.target)||e.contains(n.target)||c()}))})(); -------------------------------------------------------------------------------- /app/scraper/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from bs4 import BeautifulSoup 6 | import re 7 | import datetime 8 | import hashlib 9 | import os 10 | import pandas 11 | import uuid 12 | from llama_index.embeddings.mistralai import MistralAIEmbedding 13 | 14 | 15 | class TextContentSpider(CrawlSpider): 16 | name = "crawler" 17 | rules = (Rule(LinkExtractor(allow=()), callback="parse_response", follow=True),) 18 | 19 | def __init__( 20 | self, 21 | domain: str = "", 22 | depth: int = 1, 23 | db_path: str = "../data", 24 | *args, 25 | **kwargs, 26 | ): 27 | super(TextContentSpider, self).__init__(*args, **kwargs) 28 | self.allowed_domains = [domain] 29 | self.start_urls = [f"https://{domain}/"] 30 | self.depth_limit = int(depth) 31 | self.results = [] 32 | self.status_counts = {} 33 | self.chunk_size = 1024 34 | self.db_path = db_path 35 | self.db_file = os.path.join(self.db_path, f"{domain}.json") 36 | self.embeddings_model = MistralAIEmbedding( 37 | api_key=os.getenv("MISTRAL_API_KEY"), 38 | model_name="mistral-embed", 39 | ) 40 | self.embeddings_model_name = "mistral-embed" 41 | 42 | # browser config 43 | self.browser_headless = False 44 | 45 | self.load_database() 46 | 47 | def update_database(self): 48 | with open(self.db_file, "r") as file: 49 | db_data = json.load(file) 50 | file.close() 51 | with open(self.db_file, "w") as file: 52 | db_data["data"] = self.database.to_dict(orient="records") 53 | self.database = pandas.DataFrame(data=db_data["data"]) 54 | json.dump(db_data, file, indent=4) 55 | file.close() 56 | 57 | def start_requests(self): 58 | for url in self.start_urls: 59 | yield scrapy.Request(url, self.parse_response, meta={"depth": 0}) 60 | 61 | def load_database(self): 62 | """ 63 | here, it also depends if we want to take the chunk size in counts, i assume we just skip this 64 | """ 65 | if os.path.exists(self.db_file): 66 | try: 67 | self.logger.info(f"Loading database from {self.db_file}") 68 | with open(self.db_file, "r") as file: 69 | self.database = json.load(file) 70 | self.database = pandas.DataFrame(data=self.database["data"]) 71 | file.close() 72 | except: 73 | self.logger.info(f"Error loading database from {self.db_file}") 74 | self.database = 
pandas.DataFrame() 75 | else: 76 | self.logger.info(f"No database found at {self.db_file}, creating new one.") 77 | self.database = pandas.DataFrame( 78 | columns=[ 79 | "url", 80 | "id", 81 | "full_text", 82 | "content_hash", 83 | "chunked_text", 84 | "last_time_crawled", 85 | "status", 86 | ] 87 | ) 88 | headers_ = { 89 | "url": [self.start_urls[0]], 90 | "time": str(datetime.datetime.now()), 91 | "config": { 92 | "depth": self.depth_limit, 93 | "chunk_size": self.chunk_size, 94 | "embedding_model": self.embeddings_model_name, 95 | "allowed_domains": [self.allowed_domains[0]], 96 | }, 97 | "data": [], 98 | } 99 | with open(self.db_file, "w") as file: 100 | json.dump(headers_, file, indent=4) 101 | file.close() 102 | 103 | def parse_text(self, text: str): 104 | soup = BeautifulSoup(text, "html.parser") 105 | for script_or_style in soup(["script", "style"]): 106 | script_or_style.decompose() 107 | clean_text = soup.get_text(separator=" ", strip=True) 108 | lines = [line.strip() for line in clean_text.splitlines()] 109 | cleaned_lines = [line for line in lines if line] 110 | cleaned_text = " ".join(cleaned_lines) 111 | 112 | return cleaned_text 113 | 114 | def get_embeddings(self, text: str, url: str) -> dict: 115 | chunks = self.chunk_text(text) 116 | embeddings = [] 117 | embedeed_sentences = self.embeddings_model.encode( 118 | chunks 119 | ).tolist() # Converted ndarray to list 120 | embeddings = [ 121 | { 122 | "chunk_text": url + ": " + chunk, 123 | "embedding": embedeed_sentences[i], 124 | "id": str(uuid.uuid4()), 125 | } 126 | for i, chunk in enumerate(chunks) 127 | ] 128 | 129 | return {"embeddings": embeddings} 130 | 131 | def chunk_text(self, text: str) -> list: 132 | chunks = [] 133 | current_chunk = "" 134 | sentences = re.split(r"(?<=[.!?]) +", text) 135 | for sentence in sentences: 136 | try: 137 | # f sentence in self.database['full_text'].values: 138 | if any( 139 | sentence in text 140 | for text in self.database["full_text"].values.tolist() 141 | ): 142 | self.logger.warning( 143 | f"Sentence already in database, skipping chunking." 
144 | ) 145 | continue 146 | except Exception: 147 | pass 148 | 149 | if len(current_chunk + sentence) <= self.chunk_size: 150 | current_chunk += sentence + " " 151 | else: 152 | if ( 153 | current_chunk.strip() 154 | ): # Check if the current chunk is not empty before appending 155 | chunks.append(current_chunk.strip()) 156 | current_chunk = sentence + " " 157 | 158 | if ( 159 | current_chunk.strip() 160 | ): # Check if the last chunk is not empty before appending 161 | chunks.append(current_chunk.strip()) 162 | 163 | return chunks 164 | 165 | def parse_response(self, response): 166 | if response.meta.get("depth", 0) > self.depth_limit: 167 | self.logger.info(f"Reached depth limit for {response.url}") 168 | return 169 | if response.request.url != response.url: 170 | self.logger.info( 171 | f"Redirected from {response.request.url} to {response.url}" 172 | ) 173 | 174 | status_code = str(response.status) 175 | if status_code not in self.status_counts: 176 | self.status_counts[status_code] = 0 177 | self.status_counts[status_code] += 1 178 | 179 | processed_text = self.parse_text(response.xpath("//body").extract_first()) 180 | if len(processed_text) < 1: 181 | self.logger.info(f"Page is empty, try rendering it") 182 | return self.parse_response(response) 183 | else: 184 | self.logger.info(f"Page is not empty, continue") 185 | 186 | content_hash = hashlib.sha256(processed_text.encode("utf-8")).hexdigest() 187 | try: 188 | filtered_df = self.database[self.database["url"] == response.url] 189 | url_entry = filtered_df.iloc[0] if not filtered_df.empty else None 190 | except Exception as e: 191 | self.logger.error(f"Error accessing database entry: {str(e)}") 192 | url_entry = None 193 | 194 | # print(url_entry) 195 | if url_entry is not None: 196 | db_id = url_entry["id"] 197 | last_time_crawled = url_entry["last_time_crawled"] 198 | if url_entry["content_hash"] != content_hash: 199 | self.logger.info( 200 | f"Content hash mismatch for {response.url}, updating entry." 
201 | ) 202 | self.database.loc[ 203 | self.database["url"] == response.url, 204 | [ 205 | "full_text", 206 | "content_hash", 207 | "chunked_text", 208 | "last_time_crawled", 209 | "status", 210 | ], 211 | ] = [ 212 | processed_text, 213 | content_hash, 214 | self.get_embeddings(processed_text, response.url), 215 | last_time_crawled, 216 | status_code, 217 | ] 218 | else: 219 | self.logger.info(f"URL {response.url} is already in the database.") 220 | else: 221 | self.logger.info(f"New URL {response.url}, adding to database.") 222 | new_entry = { 223 | "url": response.url, 224 | "id": str(uuid.uuid4()), 225 | "full_text": processed_text, 226 | "content_hash": content_hash, 227 | "chunked_text": self.get_embeddings(processed_text, response.url), 228 | "last_time_crawled": str(datetime.datetime.now()), 229 | "status": status_code, 230 | } 231 | self.database = pandas.concat( 232 | [self.database, pandas.DataFrame([new_entry])], ignore_index=True 233 | ) 234 | 235 | self.update_database() 236 | 237 | links = LinkExtractor(allow=()).extract_links(response) 238 | for link in links: 239 | current_depth = response.meta.get("depth", 0) 240 | if current_depth < self.depth_limit: 241 | yield scrapy.Request( 242 | link.url, 243 | callback=self.parse_response, 244 | meta={"depth": current_depth + 1}, 245 | errback=self.handle_error, 246 | ) 247 | 248 | def handle_error(self, failure): 249 | if failure.value.response.status == 429: 250 | self.logger.error( 251 | f"Received 429 Too Many Requests from {failure.request.url}" 252 | ) 253 | # Optionally, you can customize retry logic here 254 | 255 | def closed(self, reason): 256 | # self.update_database() 257 | with open(self.db_file, "w") as f: 258 | output_data = { 259 | "url": self.start_urls, 260 | "time": str(datetime.datetime.now()), 261 | "config": { 262 | "depth": self.depth_limit, 263 | "chunk_size": self.chunk_size, 264 | "embedding_model": self.embeddings_model_name, 265 | "allowed_domains": self.allowed_domains, 266 | }, 267 | "data": self.database.to_dict(orient="records"), 268 | } 269 | json.dump(output_data, f, indent=4) 270 | self.logger.info(f"Closed spider with reason: {reason}") 271 | self.logger.info(f"Total requests sent: {len(self.results)}") 272 | self.logger.info(f"Status code counts: {self.status_counts}") 273 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2024 Phospho SAS 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/app/models.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 | from scraper import (
3 |     TextContentSpider,
4 | )  # load the scraper from our scrapy project
5 | from scrapy.crawler import CrawlerProcess  # type:ignore
6 | from scrapy.utils.project import (  # type:ignore
7 |     get_project_settings,
8 | )
9 | from qdrant_client import QdrantClient
10 | from llama_index.vector_stores.qdrant import QdrantVectorStore
11 | from llama_index.core import StorageContext
12 | from llama_index.embeddings.mistralai import MistralAIEmbedding
13 | from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
14 | import time
15 | import json
16 | from mistralai import Mistral, AssistantMessage, ToolMessage
17 | import os
18 | import functools
19 | from typing import Generator
20 | from dotenv import load_dotenv
21 | from typing import List
22 | from pydantic import BaseModel
23 | import phospho
24 | 
25 | load_dotenv()
26 | 
27 | phospho.init()
28 | 
29 | # Check that the environment variables are set
30 | assert os.getenv("MISTRAL_API_KEY"), "MISTRAL_API_KEY environment variable not set"
31 | 
32 | 
33 | class QuestionOnUrlRequest(BaseModel):
34 |     question: str
35 | 
36 | 
37 | class ScraperInterface:
38 |     """
39 |     Scraper logic:
40 |     - the Scrapy spider uses a LinkExtractor (a URL follower: it finds all the URLs on a page and then follows them)
41 |     - all the scraped content is exported to a JSON file
42 |     - for the JSON format, check @json_format.py
43 |     """
44 | 
45 |     def __init__(self, domain, depth):
46 |         """
47 |         Initialize the ScraperInterface with domain and depth.
48 | 
49 |         :param domain: The domain to scrape.
50 |         :param depth: The depth of the crawl.
51 |         """
52 |         self.domain = domain
53 |         self.depth = depth
54 |         self.output_path = os.path.join(os.getcwd(), "data", f"{domain}.json")
55 |         self.spider_db = os.path.join(os.getcwd(), "data")
56 | 
57 |     def run_crawler(self):
58 |         """
59 |         Run the Scrapy crawler to scrape the website.
60 |         """
61 |         print("Running crawler")
62 |         start_time = time.time()
63 |         process = CrawlerProcess(get_project_settings())
64 |         process.crawl(
65 |             TextContentSpider,
66 |             domain=self.domain,
67 |             depth=self.depth,
68 |             output_path=self.output_path,
69 |             db_path=self.spider_db,
70 |         )
71 |         process.start()  # Start the reactor and perform all crawls
72 |         end_time = time.time()
73 |         logger.info(f"Time taken: {end_time - start_time} seconds")
74 | 
75 | 
76 | class EmbeddingsVS:
77 |     def __init__(self, domain):
78 |         """
79 |         Initialize the EmbeddingsVS with the domain.
80 |         These attributes make it easy to change which embedding model or parameters are used.
81 | 
82 |         :param domain: The domain to create embeddings for.
83 | """ 84 | self.vector_db_name = domain.replace(".", "_") 85 | self.domain = domain 86 | self.embed_model = MistralAIEmbedding( 87 | model_name="mistral-embed", api_key=os.getenv("MISTRAL_API_KEY") 88 | ) 89 | 90 | if os.getenv("QDRANT_API_KEY") and os.getenv("QDRANT_LOCATION"): 91 | logger.info("Connecting to Qdrant cloud") 92 | try: 93 | self.client = QdrantClient( 94 | api_key=os.getenv("QDRANT_API_KEY"), 95 | location=os.getenv("QDRANT_LOCATION"), 96 | ) 97 | except Exception as e: 98 | logger.error(f"Failed to connect to Qdrant: {str(e)}") 99 | self.client = None 100 | logger.error(f"QDRANT_API_KEY: {os.getenv('QDRANT_API_KEY')}") 101 | logger.error(f"QDRANT_LOCATION: {os.getenv('QDRANT_LOCATION')}") 102 | else: 103 | logger.info("Connecting to Qdrant local") 104 | self.client = QdrantClient( 105 | # you can use :memory: mode for fast and light-weight experiments, 106 | # it does not require to have Qdrant deployed anywhere 107 | # but requires qdrant-client >= 1.1.1 108 | # location=":memory:" 109 | # otherwise set Qdrant instance address with: 110 | # url="http://${escapeHtml(code.trim())}`;
463 | });
464 |
465 | // Process inline code
466 |     text = text.replace(/`([^`]+)`/g, '<code>$1</code>');
467 |
468 | // Process LaTeX
469 | text = text.replace(/\$\$([\s\S]*?)\$\$/g, (match, latex) => {
470 | return `'+m+'
';
492 |     });
493 | 
494 |     return text;
495 |   }
496 | 
497 |   function escapeHtml(unsafe) {
498 |     return unsafe
499 |       .replace(/&/g, "&amp;")
500 |       .replace(/</g, "&lt;")
501 |       .replace(/>/g, "&gt;")
502 |       .replace(/"/g, "&quot;")
503 |       .replace(/'/g, "&#039;");
504 |   }
505 | 
506 |   function addCopyButtons(container) {
507 |     const codeBlocks = container.querySelectorAll('pre');
508 |     codeBlocks.forEach((block) => {
509 |       if (!block.querySelector('.copy-btn')) {
510 |         const copyBtn = document.createElement('button');
511 |         copyBtn.textContent = 'Copy';
512 |         copyBtn.className = 'copy-btn';
513 |         copyBtn.addEventListener('click', () => {
514 |           const code = block.querySelector('code').textContent;
515 |           navigator.clipboard.writeText(code).then(() => {
516 |             copyBtn.textContent = 'Copied!';
517 |             setTimeout(() => {
518 |               copyBtn.textContent = 'Copy';
519 |             }, 2000);
520 |           });
521 |         });
522 |         block.appendChild(copyBtn);
523 |       }
524 |     });
525 |   }
526 | 
527 |   // Event listeners
528 |   chatBubble.addEventListener("click", toggleChat);
529 |   closeButton.addEventListener("click", closeChat);
530 |   sendButton.addEventListener("click", handleSendMessage);
531 | 
532 |   messageInput.addEventListener("keypress", (e) => {
533 |     if (e.key === "Enter") {
534 |       handleSendMessage();
535 |     }
536 |   });
537 | 
538 |   // Close chat when clicking outside
539 |   document.addEventListener("click", (e) => {
540 |     if (
541 |       isOpen &&
542 |       !chatWindow.contains(e.target) &&
543 |       !chatBubble.contains(e.target)
544 |     ) {
545 |       closeChat();
546 |     }
547 |   });
548 | 
549 | 
--------------------------------------------------------------------------------