├── app ├── __init__.py ├── domain_status.json ├── .gitattributes ├── .dockerignore ├── scraper │ ├── spiders │ │ ├── __init__.py │ │ └── spider.py │ ├── items.py │ ├── pipelines.py │ ├── __init__.py │ ├── middlewares.py │ └── settings.py ├── scrapy.cfg ├── dockerfile ├── .env.example ├── package.json ├── pyproject.toml ├── webpack.config.js ├── gcp_deploy.sh ├── .gitignore ├── main.py ├── static │ └── chat-bubble.js ├── models.py └── interface │ └── chat-bubble.js ├── .gitignore ├── CONTRIBUTING.md ├── .env.example ├── demo.html ├── docker-compose.yml ├── README.md └── LICENSE /app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/domain_status.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv 3 | .mypy_cache/ 4 | __pycache__/ 5 | .DS_Store 6 | gcp_vm.md -------------------------------------------------------------------------------- /app/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /app/.dockerignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv 3 | interface/ 4 | webpack.config.js 5 | __pycache__/ 6 | .mypy_cache/ 7 | notebooks/ 8 | data/ -------------------------------------------------------------------------------- /app/scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /app/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scraper 12 | -------------------------------------------------------------------------------- /app/scraper/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class Scraper1Item(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing Guidelines 2 | 3 | Thank you for contributing! 🎉 Please follow these steps: 4 | 5 | - Fork the repository and create a branch (feature/your-feature). 6 | - Write clean, well-documented code. 7 | - Submit a Pull Request (PR) with a clear description and link to the issue. 
8 | - Respond to feedback and make changes as needed. 9 | -------------------------------------------------------------------------------- /app/dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM python:3.11 3 | 4 | WORKDIR /app 5 | 6 | RUN pip install poetry 7 | COPY ./pyproject.toml /app/pyproject.toml 8 | COPY ./poetry.lock /app/poetry.lock 9 | RUN poetry config virtualenvs.create false 10 | RUN poetry install --no-interaction --no-ansi --no-root 11 | 12 | COPY . /app 13 | 14 | EXPOSE 8080 15 | CMD ["python", "-m", "main"] -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | URL=https://www.example.com # Your assistant will know everything about this URL 2 | 3 | # To add: 4 | MISTRAL_API_KEY=... 5 | PHOSPHO_API_KEY=... 6 | PHOSPHO_PROJECT_ID=... 7 | 8 | # Advanced config (Optional ) 9 | ORIGINS='["*"]' # Used for CORS policy. Note: this string is evaluated to an array. 10 | SERVER_URL=http://localhost:8080 # The URL of the server -------------------------------------------------------------------------------- /app/.env.example: -------------------------------------------------------------------------------- 1 | URL=https://www.example.com # Your assistant will know everything about this URL 2 | 3 | # To add: 4 | MISTRAL_API_KEY=... 5 | PHOSPHO_API_KEY=... 6 | PHOSPHO_PROJECT_ID=... 7 | 8 | # Advanced config (Optional ) 9 | ORIGINS='["*"]' # Used for CORS policy. Note: this string is evaluated to an array. 10 | SERVER_URL=http://localhost:8080 # The URL of the server -------------------------------------------------------------------------------- /app/scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class Scraper1Pipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /app/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "@babel/preset-react": "^7.25.9", 4 | "babel-loader": "^9.2.1", 5 | "dotenv": "^16.4.5", 6 | "lucide-react": "^0.454.0", 7 | "react": "^18.3.1", 8 | "react-dom": "^18.3.1", 9 | "webpack": "^5.96.1", 10 | "webpack-cli": "^5.1.4" 11 | }, 12 | "devDependencies": { 13 | "@babel/cli": "^7.25.9", 14 | "@babel/core": "^7.26.0", 15 | "@babel/preset-env": "^7.26.0" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 |Look, you can now chat with an AI assistant here.
19 | 20 | 21 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | qdrant: 5 | image: qdrant/qdrant:latest 6 | container_name: qdrant 7 | ports: 8 | - "6333:6333" 9 | volumes: 10 | - qdrant_data:/qdrant/storage 11 | 12 | app: 13 | build: 14 | context: ./app 15 | dockerfile: Dockerfile 16 | container_name: python_app 17 | ports: 18 | - "8080:8080" 19 | depends_on: 20 | - qdrant 21 | environment: 22 | PYTHONPATH: /app 23 | QDRANT_HOST: qdrant 24 | QDRANT_PORT: 6333 25 | QDRANT_API_KEY: ${QDRANT_API_KEY} 26 | QDRANT_LOCATION: ${QDRANT_LOCATION} 27 | URL: ${URL} 28 | MISTRAL_API_KEY: ${MISTRAL_API_KEY} 29 | PHOSPHO_API_KEY: ${PHOSPHO_API_KEY} 30 | PHOSPHO_PROJECT_ID: ${PHOSPHO_PROJECT_ID} 31 | 32 | volumes: 33 | qdrant_data: 34 | -------------------------------------------------------------------------------- /app/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "chat-extension" 3 | version = "0.1.0" 4 | description = "Interatct with any website through a chatbot" 5 | authors = ["frederic.legrand", "wandrille.flamant"] 6 | readme = "README.md" 7 | packages = [{ include = "app" }] 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.11,<3.13" 11 | fastapi = "^0.115.0" 12 | uvicorn = "^0.31.0" 13 | python-dotenv = "^1.0.1" 14 | scrapy = "^2.11.2" 15 | qdrant-client = "^1.11.3" 16 | mistralai = "^1.1.0" 17 | beautifulsoup4 = "^4.12.3" 18 | pandas = "^2.2.3" 19 | loguru = "^0.7.2" 20 | phospho = "^0.3.44" 21 | llama-index = "^0.12.25" 22 | llama-index-vector-stores-qdrant = "^0.4.0" 23 | llama-index-embeddings-mistralai = "^0.3.0" 24 | fastapi-simple-rate-limiter = "^0.0.4" 25 | 26 | [tool.poetry.group.dev.dependencies] 27 | mypy = "^1.11.2" 28 | 29 | [build-system] 30 | requires = ["poetry-core"] 31 | build-backend = "poetry.core.masonry.api" 32 | -------------------------------------------------------------------------------- /app/webpack.config.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const dotenv = require('dotenv'); 3 | const webpack = require('webpack'); 4 | 5 | dotenv.config(); 6 | 7 | module.exports = { 8 | entry: './interface/chat-bubble.js', // path to your ChatBubble script 9 | output: { 10 | path: path.resolve(__dirname, 'component'), 11 | filename: 'chat-bubble.js', // output bundled file 12 | }, 13 | module: { 14 | rules: [ 15 | { 16 | test: /\.js$/, 17 | exclude: /node_modules/, 18 | use: { 19 | loader: 'babel-loader', 20 | options: { 21 | presets: ['@babel/preset-react'], 22 | }, 23 | }, 24 | }, 25 | ], 26 | }, 27 | plugins: [ 28 | new webpack.DefinePlugin({ 29 | 'process.env.SERVER_URL': JSON.stringify(process.env.SERVER_URL) || JSON.stringify('http://localhost:8080'), 30 | }), 31 | ], 32 | mode: 'production', 33 | }; 34 | -------------------------------------------------------------------------------- /app/gcp_deploy.sh: -------------------------------------------------------------------------------- 1 | 2 | # This file use GCP cloud build to build the image and deploy it to GCP Cloud Run 3 | # For it to work with your GCP project, replace the project id, region, and other variables with your own 4 | 5 | # You will need 6 | # - Qdrant cloud to host your vectors: https://qdrant.tech 7 | # - gcloud CLI installed and authenticated with your GCP account: 
https://cloud.google.com/sdk/docs/install 8 | # - environment variables in app/.env file (look at app/.env.example for reference) 9 | 10 | # EXAMPLE USAGE: 11 | # gcloud init 12 | # sudo bash app/gcloud_deploy.sh 13 | 14 | echo "Deploying ai-chat-bubble to GCP" 15 | 16 | # GCP builds the image and pushes it to the container registry 17 | gcloud builds submit --region=europe-west1 --tag europe-west1-docker.pkg.dev/portal-385519/phospho-backend/ai-chat-bubble:latest 18 | 19 | # Read the .env file and export the variables 20 | set -a && source .env && set +a 21 | 22 | # Deploy the image to GCP Cloud Run 23 | gcloud run deploy ai-chat-bubble \ 24 | --image=europe-west1-docker.pkg.dev/portal-385519/phospho-backend/ai-chat-bubble:latest \ 25 | --region=europe-west1 \ 26 | --allow-unauthenticated \ 27 | --set-env-vars URL=$URL,PHOSPHO_API_KEY=$PHOSPHO_API_KEY,PHOSPHO_PROJECT_ID=$PHOSPHO_PROJECT_ID \ 28 | --set-env-vars QDRANT_API_KEY=$QDRANT_API_KEY,QDRANT_LOCATION=$QDRANT_LOCATION,ORIGIN=$ORIGIN,MISTRAL_API_KEY=$MISTRAL_API_KEY \ 29 | --memory=1Gi -------------------------------------------------------------------------------- /app/scraper/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from scrapy.crawler import CrawlerProcess # type: ignore 4 | from scrapy.utils.project import get_project_settings # type: ignore 5 | from scraper.spiders.spider import TextContentSpider # type: ignore 6 | 7 | 8 | class ScraperInterface: 9 | """ 10 | scraper logic: 11 | - scrapy project url LinkExtractor (basically a url follower, it will find all the urls in a page and then follow them) 12 | - export all the content to a json exporter, it will export the scraped data to a json file) 13 | - for the json format, check @json_format.py 14 | """ 15 | 16 | def __init__(self, domain, depth): 17 | """ 18 | Initialize the ScraperInterface with domain and depth. 19 | 20 | :param domain: The domain to scrape. 21 | :param depth: The depth of the crawl. 22 | """ 23 | self.domain = domain 24 | self.depth = depth 25 | self.output_path = f"data/{self.domain}.json" 26 | self.spider_db = os.path.join(os.getcwd(), "data") 27 | 28 | def run_crawler(self): 29 | """ 30 | Run the Scrapy crawler to scrape the website. 31 | """ 32 | start_time = time.time() 33 | process = CrawlerProcess(get_project_settings()) 34 | process.crawl( 35 | TextContentSpider, 36 | domain=self.domain, 37 | depth=self.depth, 38 | output_path=self.output_path, 39 | db_path=self.spider_db, 40 | ) 41 | process.start() # Start the reactor and perform all crawls 42 | end_time = time.time() 43 | print(f"Time taken: {end_time - start_time} seconds") 44 | -------------------------------------------------------------------------------- /app/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Project Specific 4 | data/* 5 | .DS_Store 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/#use-with-ide 116 | .pdm.toml 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
166 | #.idea/ 167 | 168 | node_modules/ 169 | 170 | .env 171 | notebooks/ -------------------------------------------------------------------------------- /app/scraper/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class Scraper1SpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info("Spider opened: %s" % spider.name) 57 | 58 | 59 | class Scraper1DownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 |
86 |         # Must either:
87 |         # - return a Response object
88 |         # - return a Request object
89 |         # - or raise IgnoreRequest
90 |         return response
91 |
92 |     def process_exception(self, request, exception, spider):
93 |         # Called when a download handler or a process_request()
94 |         # (from other downloader middleware) raises an exception.
95 |
96 |         # Must either:
97 |         # - return None: continue processing this exception
98 |         # - return a Response object: stops process_exception() chain
99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 |
102 |     def spider_opened(self, spider):
103 |         spider.logger.info("Spider opened: %s" % spider.name)
104 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AI chat bubble - custom AI assistant connected to your knowledge
2 |
3 | **Simple and fast AI chat bubble for your HTML website.** The AI assistant can answer questions about a website's content using RAG, streaming, and the Mistral model. Compatible with **React** and **WordPress**!
4 |
5 | **How does it work?**
6 |
7 | 1. Run the backend to create an assistant with knowledge about your website's content
8 | 2. Add a code snippet to your HTML frontend
9 | 3. Your users can now chat with an assistant in an AI chat bubble!
10 |
11 | **Production-ready**
12 |
13 | You can host the AI chat bubble on your own machine with a simple `docker-compose up --build`.
14 | See what users are asking thanks to [phospho analytics](https://phospho.ai), which is already integrated.
15 |
16 |
17 |
18 | ## Quickstart
19 |
20 | ### 1. Set up .env
21 |
22 | Clone this repository.
23 |
24 | ```bash
25 | # Clone using the web URL
26 | git clone https://github.com/phospho-app/ai-chat-bubble.git
27 | ```
28 |
29 | Then, create a `.env` file at the root with this content:
30 |
31 | ```bash
32 | URL=https://www.example.com # Your assistant will know everything about this URL
33 |
34 | # To add:
35 | MISTRAL_API_KEY=...
36 | PHOSPHO_API_KEY=...
37 | PHOSPHO_PROJECT_ID=...
38 | ```
39 |
40 | In `URL`, put the website with the relevant content you want the AI assistant to know about.
41 | The assistant will crawl domains with a depth of 3 (this is customizable).
42 |
43 | #### External services
44 |
45 | - **LLM:** We use the Mistral AI model - _mistral-large-latest_. Get your `MISTRAL_API_KEY` [here](https://mistral.ai).
46 | - **Analytics:** Messages are logged to phospho. Get your `PHOSPHO_API_KEY` and your `PHOSPHO_PROJECT_ID` [here](https://platform.phospho.ai).
47 |
48 | ### 2. Run the assistant backend
49 |
50 | To deploy the backend of the AI chat bubble, this repository uses [docker compose](https://docs.docker.com/compose/). [Follow this guide to install docker compose](https://docs.docker.com/compose/install/), then run the assistant's backend:
51 |
52 | ```bash
53 | cd ai-chat-bubble # the name of the cloned repo
54 | docker-compose up --build
55 | ```
56 |
57 | Questions are sent to the assistant using the POST API endpoint `/question_on_url`. This returns a streamable response. Go to [localhost:8080/docs](http://localhost:8080/docs) for more details.
58 |
59 | ### 3. Add the chat bubble to your website
60 |
61 | Add the chat bubble to your website with this snippet in an HTML component:
62 |
63 | ```html
64 | <script src="http://localhost:8080/static/chat-bubble.js"></script>
65 | ```
66 |
67 | If you just want to test your assistant, you simply need to open the `demo.html` file in your browser.
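You can also test the backend without the widget by calling the endpoint directly. Below is a minimal sketch that sends a question to a locally running backend and prints the streamed answer as it arrives; it assumes the backend is on the default `SERVER_URL` (`http://localhost:8080`) and that the `requests` package is installed (it is not a project dependency).

```python
# Minimal sketch: query the assistant backend directly.
# Assumes the backend is running locally on the default port and that
# `requests` is installed (`pip install requests`).
import requests

SERVER_URL = "http://localhost:8080"

response = requests.post(
    f"{SERVER_URL}/question_on_url",
    json={"question": "What is this website about?"},  # the endpoint expects a `question` field
    stream=True,  # the backend returns a streaming text/plain response
    timeout=60,
)
response.raise_for_status()

# Print the answer as it is streamed back
for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)
print()
```

Note that this endpoint is rate limited (2 requests per 5 seconds), so calls in quick succession may be rejected.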
68 |
69 | See the advanced configuration section below to change its style.
70 |
71 | ## Advanced configuration
72 |
73 | ### Change the chat bubble UI
74 |
75 | The file `component/chat-bubble.js` contains the AI chat bubble style. It is served as a static file and is the compiled version of `interface/chat-bubble.js`.
76 |
77 | To change the AI chat bubble, edit `interface/chat-bubble.js` and then run `npx webpack` in the _app_ folder of the repo.
78 |
79 | ### CORS policy
80 |
81 | In production, it's best to set up a restrictive CORS policy so that only your frontend can call your AI assistant backend. To do this, add an `ORIGINS` list in your `.env`.
82 |
83 | ```
84 | ORIGINS = ["http://localhost:3000", "http://localhost:3001"]
85 | ```
86 |
87 | _Only URLs in `ORIGINS` can access the `/question_on_url` endpoint._
88 |
89 | ### Edit ports
90 |
91 | The Docker container runs the main app on port _8080_. To change it, set `SERVER_URL` in your `.env` to a URL with the new port.
92 |
93 | ```
94 | SERVER_URL=http://localhost:your_new_port
95 | ```
96 |
97 | Then change the source of the interface script accordingly (the `src` attribute of the `<script>` tag).
98 |
99 | ### Prompts, AI, vector databases
100 |
101 | The AI assistant of the AI chat bubble uses [Llama Index](https://docs.llamaindex.ai/en/stable/), [Qdrant](https://qdrant.tech/documentation/), and [Mistral](https://docs.mistral.ai). This behaviour is implemented in `models.py`.
102 |
103 | - Edit `ChatMistral` to change the prompts or models
104 | - Edit the `EmbeddingsVS` client to use a vector store other than Qdrant
105 |
106 | ## About
107 |
108 | Made by juniors for juniors in PARIS - phospho team 🥖🇫🇷
109 |
110 | Special thanks to @flamschou, @fred3105, and @oulianov 🧪💚
111 |
--------------------------------------------------------------------------------
/app/scraper/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for scraper project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used.
You can find more settings consulting the documentation: 5 | # 6 | # https://s.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = "scraper" 11 | 12 | SPIDER_MODULES = ["scraper.spiders"] 13 | NEWSPIDER_MODULE = "scraper.spiders" 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | # USER_AGENT = "scraper (+http://www.yourdomain.com)" 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = True 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | # CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | # DOWNLOAD_DELAY = 3 29 | # The download delay setting will honor only one of: 30 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | # CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | # COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | # TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | # DEFAULT_REQUEST_HEADERS = { 41 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 42 | # "Accept-Language": "en", 43 | # } 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | # SPIDER_MIDDLEWARES = { 48 | # "scraper.middlewares.Scraper1SpiderMiddleware": 543, 49 | # } 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | # DOWNLOADER_MIDDLEWARES = { 54 | # "scraper.middlewares.Scraper1DownloaderMiddleware": 543, 55 | # } 56 | 57 | # Enable or disable extensions 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 59 | # EXTENSIONS = { 60 | # "scrapy.extensions.telnet.TelnetConsole": None, 61 | # } 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | # ITEM_PIPELINES = { 66 | # "scraper.pipelines.Scraper1Pipeline": 300, 67 | # } 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 71 | # AUTOTHROTTLE_ENABLED = True 72 | # The initial download delay 73 | # AUTOTHROTTLE_START_DELAY = 5 74 | # The maximum download delay to be set in case of high latencies 75 | # AUTOTHROTTLE_MAX_DELAY = 60 76 | # The average number of requests Scrapy should be sending in parallel to 77 | # each remote server 78 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 79 | # Enable showing throttling stats for every response received: 80 | # AUTOTHROTTLE_DEBUG = False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | # HTTPCACHE_ENABLED = True 85 | # HTTPCACHE_EXPIRATION_SECS = 0 86 | # HTTPCACHE_DIR = "httpcache" 87 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 88 | # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 89 | 90 | # Set settings whose default value is deprecated to a future-proof value 91 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 92 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 93 | 
FEED_EXPORT_ENCODING = "utf-8" 94 | 95 | 96 | # --- settings config for retry middleware --- 97 | # Enable or ensure the RetryMiddleware is enabled 98 | RETRY_ENABLED = True 99 | 100 | # Include 429 in the retryable status codes 101 | RETRY_HTTP_CODES = [429, 500, 502, 503, 504, 408, 400] 102 | 103 | # Set the maximum number of retry attempts 104 | RETRY_TIMES = 5 # This can be adjusted based on your needs 105 | 106 | # Set the delay between retries 107 | RETRY_PRIORITY_ADJUST = ( 108 | -1 109 | ) # Adjust priority of retries to be higher than other requests 110 | DOWNLOAD_DELAY = 1 # Delay between requests to the same domain 111 | 112 | CONCURRENT_REQUESTS = ( 113 | 32 # or higher, depending on your bandwidth and the server's capacity 114 | ) 115 | COOKIES_ENABLED = False 116 | REDIRECT_MAX_TIMES = 3 117 | -------------------------------------------------------------------------------- /app/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | from dotenv import load_dotenv 5 | from fastapi import FastAPI, HTTPException 6 | from fastapi.responses import FileResponse, StreamingResponse 7 | from fastapi_simple_rate_limiter import rate_limiter 8 | 9 | sys.path.append(os.path.abspath(os.path.dirname(__file__))) 10 | 11 | from models import QuestionOnUrlRequest 12 | from typing import Dict, Optional 13 | from urllib.parse import urlparse 14 | from models import MainExecute 15 | from contextlib import asynccontextmanager 16 | from fastapi.middleware.cors import CORSMiddleware 17 | from loguru import logger 18 | 19 | load_dotenv() 20 | 21 | # Check that the environment variables are set 22 | assert os.getenv("MISTRAL_API_KEY"), "MISTRAL_API_KEY environment variable not set" 23 | assert os.getenv("URL"), "URL environment variable not set" 24 | 25 | URL = os.getenv("URL") 26 | DOMAIN_STATUS_FILE = "domain_status.json" 27 | DATA_FOLDER = "data" 28 | ORIGINS = os.getenv("ORIGINS", ["*"]) 29 | # Parse the string into an array. 
Not needed if using load_dotenv 30 | if isinstance(ORIGINS, str): 31 | ORIGINS = eval(ORIGINS) 32 | SERVER_URL = os.getenv("SERVER_URL", "http://localhost:8080") 33 | 34 | host, port = urlparse(SERVER_URL).netloc.split(":") 35 | 36 | 37 | # Dictionary to store the status of each domain 38 | domain_status: Dict[str, str] = {} 39 | 40 | # Dictionary to store MainExecute instances for each domain 41 | domain_instances: Dict[str, MainExecute] = {} 42 | 43 | 44 | def load_domain_status(): 45 | if os.path.exists(DOMAIN_STATUS_FILE): 46 | with open(DOMAIN_STATUS_FILE, "r") as f: 47 | return json.load(f) 48 | else: 49 | return create_json_file() 50 | 51 | 52 | def create_json_file(): 53 | if os.path.exists(DATA_FOLDER): 54 | for filename in os.listdir(DATA_FOLDER): 55 | if filename.endswith(".json"): 56 | domain = filename[:-5] # Remove the .json extension 57 | domain_status[domain] = "completed" 58 | save_domain_status() 59 | return domain_status 60 | 61 | 62 | def save_domain_status(): 63 | with open(DOMAIN_STATUS_FILE, "w") as f: 64 | json.dump(domain_status, f) 65 | 66 | 67 | def initialize_domains(): 68 | global domain_status, domain_instances 69 | domain_status = load_domain_status() 70 | print(f"Loaded domain status: {domain_status}") 71 | for domain, status in domain_status.items(): 72 | print(f"Initializing domain: {domain}") 73 | if status == "completed": 74 | try: 75 | main_execute = MainExecute(domain, load=False) 76 | domain_instances[domain] = main_execute 77 | except Exception as e: 78 | logger.error(f"Failed to initialize domain {domain}: {str(e)}") 79 | domain_status[domain] = f"failed: {str(e)}" 80 | save_domain_status() 81 | 82 | 83 | def submit_url(url: Optional[str]): 84 | if url is None: 85 | raise HTTPException(status_code=400, detail="URL not set") 86 | domain = urlparse(url).netloc 87 | if domain not in domain_status or domain_status[domain] not in [ 88 | "queued", 89 | "processing", 90 | "completed", 91 | ]: 92 | domain_status[domain] = "queued" 93 | save_domain_status() 94 | logger.info(f"Submitting domain: {domain}") 95 | try: 96 | process_domain(domain) 97 | logger.info(f"{domain} indexation completed") 98 | except Exception as e: 99 | logger.error(f"Failed to process domain {domain}: {str(e)}") 100 | domain_status[domain] = f"failed: {str(e)}" 101 | save_domain_status() 102 | else: 103 | logger.info(f"Domain {domain} already being processed") 104 | 105 | 106 | @asynccontextmanager 107 | async def lifespan(app: FastAPI): 108 | # Startup: You can add initialization code here 109 | logger.info("Starting the application") 110 | 111 | initialize_domains() 112 | 113 | logger.info(f"domains: {domain_status.keys()}") 114 | 115 | logger.info(f"URL: {URL}") 116 | 117 | logger.info(f"Server URL: {SERVER_URL}") 118 | 119 | yield # Here the FastAPI application runs 120 | 121 | # Shutdown: You can add cleanup code here if needed 122 | print("Shutting down the application") 123 | 124 | 125 | app = FastAPI(lifespan=lifespan) 126 | 127 | # Add CORS middleware 128 | app.add_middleware( 129 | CORSMiddleware, 130 | allow_origins=ORIGINS 131 | if ORIGINS is not None 132 | else [], # Specifies the origins allowed to access this API 133 | allow_credentials=True, 134 | allow_methods=["*"], # Allows all methods (POST, GET, etc.) 
135 | allow_headers=["*"], # Allows all headers 136 | ) 137 | 138 | 139 | @rate_limiter(limit=3, seconds=60) 140 | @app.get("/") 141 | async def health_check(): 142 | return {"status": "ok"} 143 | 144 | 145 | @rate_limiter(limit=3, seconds=60) 146 | # Serve static files 147 | @app.get("/static/chat-bubble.js") 148 | async def serve_component_file(): 149 | file_path = os.path.join("static", "chat-bubble.js") 150 | if os.path.exists(file_path): 151 | return FileResponse(file_path) 152 | return {"error": "File not found"}, 404 153 | 154 | 155 | def process_domain(domain: str): 156 | domain_folder = os.path.join("data", domain) 157 | os.makedirs(domain_folder, exist_ok=True) 158 | logger.info(f"Indexing domain: {domain}") 159 | try: 160 | domain_status[domain] = "processing" 161 | save_domain_status() 162 | main_execute = MainExecute(domain) 163 | domain_instances[domain] = main_execute 164 | domain_status[domain] = "completed" 165 | save_domain_status() 166 | except Exception as e: 167 | domain_status[domain] = f"failed: {str(e)}" 168 | save_domain_status() 169 | 170 | 171 | @rate_limiter(limit=2, seconds=5) 172 | @app.post("/question_on_url") 173 | async def question_on_url(request: QuestionOnUrlRequest): 174 | if URL is None: 175 | raise HTTPException(status_code=400, detail="URL not set") 176 | url = URL 177 | logger.debug(f"Question on URL: {url}") 178 | question = request.question 179 | domain = urlparse(url).netloc 180 | 181 | logger.debug(f"Domain: {domain}") 182 | logger.debug(f"Domains: {domain_instances.keys()}") 183 | 184 | if domain not in domain_instances: 185 | raise HTTPException(status_code=400, detail="Domain not processed yet") 186 | 187 | if domain_status.get(domain) != "completed": 188 | raise HTTPException(status_code=400, detail="Domain processing not completed") 189 | 190 | logger.debug(f"Domains: {domain_instances.keys()}") 191 | main_execute = domain_instances[domain] 192 | 193 | return StreamingResponse(main_execute.ask(question), media_type="text/plain") 194 | 195 | 196 | if __name__ == "__main__": 197 | initialize_domains() 198 | submit_url(URL) 199 | 200 | import uvicorn 201 | 202 | uvicorn.run("main:app", host="0.0.0.0", port=int(port), reload=True) 203 | -------------------------------------------------------------------------------- /app/static/chat-bubble.js: -------------------------------------------------------------------------------- 1 | (()=>{const n=document.createElement("style");n.textContent="\n* {\n box-sizing: border-box;\n margin: 0;\n padding: 0;\n font-family: 'Inter', sans-serif; /* Using a modern sans-serif font like Inter */\n}\n\n/* Chat Bubble (Floating button) */\n.chat-bubble {\n position: fixed;\n bottom: 20px;\n right: 20px;\n width: 60px;\n height: 60px;\n background: #3e8ef7;\n border-radius: 50%;\n cursor: pointer;\n box-shadow: 0 2px 12px rgba(0, 0, 0, 0.1);\n display: flex;\n align-items: center;\n justify-content: center;\n transition: all 0.3s ease;\n z-index: 1000;\n}\n\n.chat-bubble:hover {\n transform: scale(1.1);\n}\n\n.chat-bubble-icon {\n font-size: 28px;\n}\n\n/* Chat Window Modal */\n.chat-window {\n position: fixed;\n bottom: 90px;\n right: 20px;\n width: 350px;\n height: 450px;\n background: #1e1e1e;\n border-radius: 16px;\n box-shadow: 0 12px 40px rgba(0, 0, 0, 0.1);\n display: none;\n flex-direction: column;\n z-index: 1000;\n opacity: 0;\n transform: translateY(30px);\n transition: opacity 0.3s ease, transform 0.3s ease;\n}\n\n.chat-window.active {\n display: flex;\n opacity: 1;\n transform: translateY(0);\n 
background: #1e1e1e;\n}\n\n/* Header */\n.chat-header {\n padding: 16px;\n background: #1e1e1e;\n color: white;\n font-size: 16px;\n font-weight: 600;\n display: flex;\n justify-content: space-between;\n align-items: center;\n border-top-left-radius: 16px;\n align-self: flex-end;\n border-top-right-radius: 16px;\n box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);\n}\n\n.close-button {\n background: none;\n border: none;\n color: white;\n cursor: pointer;\n font-size: 18px;\n transition: color 0.3s ease;\n}\n\n.close-button:hover {\n color: #3e8ef7;\n}\n\n/* Chat Messages Section */\n.chat-messages {\n flex-grow: 1;\n padding: 20px;\n overflow-y: auto;\n display: flex;\n flex-direction: column;\n gap: 14px;\n background: #1e1e1e;\n}\n\n.message {\n padding: 12px 18px;\n border-radius: 20px;\n max-width: 80%;\n word-wrap: break-word;\n font-size: 14px;\n line-height: 1.4;\n}\n\n.message.sent {\n background: #2A2A2A;\n color: white;\n align-self: flex-end;\n}\n\n.message.received {\n background: #3e8ef7;\n color: white;\n align-self: flex-start;\n}\n\n/* Chat Input Section */\n.chat-input {\n display: flex;\n padding: 16px;\n gap: 12px;\n background: #1e1e1e;\n}\n\n.message-input {\n flex-grow: 1;\n padding: 12px 18px;\n border: 1px solid #e0e0e0;\n border-radius: 30px;\n outline: none;\n font-size: 14px;\n transition: border-color 0.3s ease;\n}\n\n.message-input:focus {\n border-color: #3e8ef7;\n}\n\n.send-button {\n background: #3e8ef7;\n color: white;\n border: none;\n border-radius: 30px;\n padding: 0px 16px;\n cursor: pointer;\n transition: background 0.3s ease;\n font-size: 24px;\n align-items: center;\n}\n\n.send-button:hover {\n background: #3378d1;\n}\n\n.send-button:active {\n background: #2566a0;\n}\n\n\n.phospho-typing-indicator {\n padding: 10px;\n display: flex;\n align-items: center;\n}\n\n.typing-dots {\n display: flex;\n align-self: flex-start;\n}\n\n.typing-dots span {\n height: 8px;\n width: 8px;\n margin: 0 4px;\n background-color: #3e8ef7;\n display: block;\n border-radius: 50%;\n opacity: 0.4;\n animation: typing 1s infinite ease-in-out;\n align-self: flex-start;\n}\n\n.typing-dots span:nth-child(1) {\n animation-delay: 0.1s;\n}\n\n.typing-dots span:nth-child(2) {\n animation-delay: 0.2s;\n}\n\n.typing-dots span:nth-child(3) {\n animation-delay: 0.3s;\n}\n\n.highlighted-link {\n text-decoration: underline;\n color: rgb(255, 255, 255);\n}\n\n/* Copy button styling */\n.copy-btn {\n position: absolute;\n top: 5px;\n right: 5px;\n background-color: #444;\n color: #e6e6e6;\n border: none;\n border-radius: 3px;\n padding: 2px 5px;\n font-size: 12px;\n cursor: pointer;\n}\n\n.copy-btn:hover {\n background-color: #555;\n}\n\n\n/* Code block styling */\npre {\n background-color: #2b2b2b;\n border: 1px solid #444;\n border-radius: 4px;\n padding: 10px;\n overflow-x: auto;\n position: relative;\n}\n\ncode {\n font-family: 'Courier New', Courier, monospace;\n font-size: 14px;\n color: #2b2b2b;\n}\n\n/* Inline code styling */\np code {\n background-color: #2c2f33;\n color: #e6e6e6;\n padding: 2px 4px;\n border-radius: 3px;\n}\n\n/* Fade In Animation */\n.chat-window.fade-in {\n animation: fadeIn 0.3s forwards;\n}\n\n@media (max-width: 767px) {\n .chat-window {\n width: 100%;\n height: 100%;\n border-radius: 0;\n bottom: 0;\n right: 0;\n }\n\n .chat-messages {\n max-height: 70%;\n }\n\n .chat-input {\n flex-wrap: wrap;\n }\n\n .message-input {\n flex-grow: 1;\n width: 100%;\n }\n\n .send-button {\n margin-top: 12px;\n font-size: 40px;\n }\n}\n\n@keyframes typing {\n 0% {\n transform: 
translateY(0px);\n background-color: #0000ff;\n }\n 28% {\n transform: translateY(-7px);\n background-color: #3e8ef7;\n }\n 44% {\n transform: translateY(0px);\n background-color: #0000ff;\n }\n}\n\n@keyframes fadeIn {\n from {\n opacity: 0;\n transform: translateY(30px);\n }\n to {\n opacity: 1;\n transform: translateY(0);\n }\n}",document.head.appendChild(n);const e=document.createElement("div");e.className="chat-bubble",e.innerHTML='',document.body.appendChild(e);const t=document.createElement("div");t.className="chat-window",t.innerHTML='\n${o=t.trim(),o.replace(/&/g,"&").replace(//g,">").replace(/"/g,""").replace(/'/g,"'")}`;var o}))).replace(/`([^`]+)`/g,"$1")).replace(/\$\$([\s\S]*?)\$\$/g,((n,e)=>`"+n+"
"}))}(n);e.querySelector("p").innerHTML=t,e.querySelectorAll("pre").forEach((n=>{if(!n.querySelector(".copy-btn")){const e=document.createElement("button");e.textContent="Copy",e.className="copy-btn",e.addEventListener("click",(()=>{const t=n.querySelector("code").textContent;navigator.clipboard.writeText(t).then((()=>{e.textContent="Copied!",setTimeout((()=>{e.textContent="Copy"}),2e3)}))})),n.appendChild(e)}})),a.scrollTop=a.scrollHeight}(o),r())};await r()}else d(),console.error("Error: Response is not a StreamingResponse");else d(),console.error("Error with the request:",e.statusText)}catch(n){d(),console.error("Error:",n)}}}l("Hello! How can I assist you today?",!1),e.addEventListener("click",(function(){s=!s,t.classList.toggle("active",s),e.querySelector(".chat-bubble-icon").textContent=s?"⌑":"💬",s&&r.focus()})),o.addEventListener("click",c),i.addEventListener("click",p),r.addEventListener("keypress",(n=>{"Enter"===n.key&&p()})),document.addEventListener("click",(n=>{!s||t.contains(n.target)||e.contains(n.target)||c()}))})(); -------------------------------------------------------------------------------- /app/scraper/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from bs4 import BeautifulSoup 6 | import re 7 | import datetime 8 | import hashlib 9 | import os 10 | import pandas 11 | import uuid 12 | from llama_index.embeddings.mistralai import MistralAIEmbedding 13 | 14 | 15 | class TextContentSpider(CrawlSpider): 16 | name = "crawler" 17 | rules = (Rule(LinkExtractor(allow=()), callback="parse_response", follow=True),) 18 | 19 | def __init__( 20 | self, 21 | domain: str = "", 22 | depth: int = 1, 23 | db_path: str = "../data", 24 | *args, 25 | **kwargs, 26 | ): 27 | super(TextContentSpider, self).__init__(*args, **kwargs) 28 | self.allowed_domains = [domain] 29 | self.start_urls = [f"https://{domain}/"] 30 | self.depth_limit = int(depth) 31 | self.results = [] 32 | self.status_counts = {} 33 | self.chunk_size = 1024 34 | self.db_path = db_path 35 | self.db_file = os.path.join(self.db_path, f"{domain}.json") 36 | self.embeddings_model = MistralAIEmbedding( 37 | api_key=os.getenv("MISTRAL_API_KEY"), 38 | model_name="mistral-embed", 39 | ) 40 | self.embeddings_model_name = "mistral-embed" 41 | 42 | # browser config 43 | self.browser_headless = False 44 | 45 | self.load_database() 46 | 47 | def update_database(self): 48 | with open(self.db_file, "r") as file: 49 | db_data = json.load(file) 50 | file.close() 51 | with open(self.db_file, "w") as file: 52 | db_data["data"] = self.database.to_dict(orient="records") 53 | self.database = pandas.DataFrame(data=db_data["data"]) 54 | json.dump(db_data, file, indent=4) 55 | file.close() 56 | 57 | def start_requests(self): 58 | for url in self.start_urls: 59 | yield scrapy.Request(url, self.parse_response, meta={"depth": 0}) 60 | 61 | def load_database(self): 62 | """ 63 | here, it also depends if we want to take the chunk size in counts, i assume we just skip this 64 | """ 65 | if os.path.exists(self.db_file): 66 | try: 67 | self.logger.info(f"Loading database from {self.db_file}") 68 | with open(self.db_file, "r") as file: 69 | self.database = json.load(file) 70 | self.database = pandas.DataFrame(data=self.database["data"]) 71 | file.close() 72 | except: 73 | self.logger.info(f"Error loading database from {self.db_file}") 74 | self.database = 
pandas.DataFrame() 75 | else: 76 | self.logger.info(f"No database found at {self.db_file}, creating new one.") 77 | self.database = pandas.DataFrame( 78 | columns=[ 79 | "url", 80 | "id", 81 | "full_text", 82 | "content_hash", 83 | "chunked_text", 84 | "last_time_crawled", 85 | "status", 86 | ] 87 | ) 88 | headers_ = { 89 | "url": [self.start_urls[0]], 90 | "time": str(datetime.datetime.now()), 91 | "config": { 92 | "depth": self.depth_limit, 93 | "chunk_size": self.chunk_size, 94 | "embedding_model": self.embeddings_model_name, 95 | "allowed_domains": [self.allowed_domains[0]], 96 | }, 97 | "data": [], 98 | } 99 | with open(self.db_file, "w") as file: 100 | json.dump(headers_, file, indent=4) 101 | file.close() 102 | 103 | def parse_text(self, text: str): 104 | soup = BeautifulSoup(text, "html.parser") 105 | for script_or_style in soup(["script", "style"]): 106 | script_or_style.decompose() 107 | clean_text = soup.get_text(separator=" ", strip=True) 108 | lines = [line.strip() for line in clean_text.splitlines()] 109 | cleaned_lines = [line for line in lines if line] 110 | cleaned_text = " ".join(cleaned_lines) 111 | 112 | return cleaned_text 113 | 114 | def get_embeddings(self, text: str, url: str) -> dict: 115 | chunks = self.chunk_text(text) 116 | embeddings = [] 117 | embedeed_sentences = self.embeddings_model.encode( 118 | chunks 119 | ).tolist() # Converted ndarray to list 120 | embeddings = [ 121 | { 122 | "chunk_text": url + ": " + chunk, 123 | "embedding": embedeed_sentences[i], 124 | "id": str(uuid.uuid4()), 125 | } 126 | for i, chunk in enumerate(chunks) 127 | ] 128 | 129 | return {"embeddings": embeddings} 130 | 131 | def chunk_text(self, text: str) -> list: 132 | chunks = [] 133 | current_chunk = "" 134 | sentences = re.split(r"(?<=[.!?]) +", text) 135 | for sentence in sentences: 136 | try: 137 | # f sentence in self.database['full_text'].values: 138 | if any( 139 | sentence in text 140 | for text in self.database["full_text"].values.tolist() 141 | ): 142 | self.logger.warning( 143 | f"Sentence already in database, skipping chunking." 
144 | ) 145 | continue 146 | except Exception: 147 | pass 148 | 149 | if len(current_chunk + sentence) <= self.chunk_size: 150 | current_chunk += sentence + " " 151 | else: 152 | if ( 153 | current_chunk.strip() 154 | ): # Check if the current chunk is not empty before appending 155 | chunks.append(current_chunk.strip()) 156 | current_chunk = sentence + " " 157 | 158 | if ( 159 | current_chunk.strip() 160 | ): # Check if the last chunk is not empty before appending 161 | chunks.append(current_chunk.strip()) 162 | 163 | return chunks 164 | 165 | def parse_response(self, response): 166 | if response.meta.get("depth", 0) > self.depth_limit: 167 | self.logger.info(f"Reached depth limit for {response.url}") 168 | return 169 | if response.request.url != response.url: 170 | self.logger.info( 171 | f"Redirected from {response.request.url} to {response.url}" 172 | ) 173 | 174 | status_code = str(response.status) 175 | if status_code not in self.status_counts: 176 | self.status_counts[status_code] = 0 177 | self.status_counts[status_code] += 1 178 | 179 | processed_text = self.parse_text(response.xpath("//body").extract_first()) 180 | if len(processed_text) < 1: 181 | self.logger.info(f"Page is empty, try rendering it") 182 | return self.parse_response(response) 183 | else: 184 | self.logger.info(f"Page is not empty, continue") 185 | 186 | content_hash = hashlib.sha256(processed_text.encode("utf-8")).hexdigest() 187 | try: 188 | filtered_df = self.database[self.database["url"] == response.url] 189 | url_entry = filtered_df.iloc[0] if not filtered_df.empty else None 190 | except Exception as e: 191 | self.logger.error(f"Error accessing database entry: {str(e)}") 192 | url_entry = None 193 | 194 | # print(url_entry) 195 | if url_entry is not None: 196 | db_id = url_entry["id"] 197 | last_time_crawled = url_entry["last_time_crawled"] 198 | if url_entry["content_hash"] != content_hash: 199 | self.logger.info( 200 | f"Content hash mismatch for {response.url}, updating entry." 
201 | ) 202 | self.database.loc[ 203 | self.database["url"] == response.url, 204 | [ 205 | "full_text", 206 | "content_hash", 207 | "chunked_text", 208 | "last_time_crawled", 209 | "status", 210 | ], 211 | ] = [ 212 | processed_text, 213 | content_hash, 214 | self.get_embeddings(processed_text, response.url), 215 | last_time_crawled, 216 | status_code, 217 | ] 218 | else: 219 | self.logger.info(f"URL {response.url} is already in the database.") 220 | else: 221 | self.logger.info(f"New URL {response.url}, adding to database.") 222 | new_entry = { 223 | "url": response.url, 224 | "id": str(uuid.uuid4()), 225 | "full_text": processed_text, 226 | "content_hash": content_hash, 227 | "chunked_text": self.get_embeddings(processed_text, response.url), 228 | "last_time_crawled": str(datetime.datetime.now()), 229 | "status": status_code, 230 | } 231 | self.database = pandas.concat( 232 | [self.database, pandas.DataFrame([new_entry])], ignore_index=True 233 | ) 234 | 235 | self.update_database() 236 | 237 | links = LinkExtractor(allow=()).extract_links(response) 238 | for link in links: 239 | current_depth = response.meta.get("depth", 0) 240 | if current_depth < self.depth_limit: 241 | yield scrapy.Request( 242 | link.url, 243 | callback=self.parse_response, 244 | meta={"depth": current_depth + 1}, 245 | errback=self.handle_error, 246 | ) 247 | 248 | def handle_error(self, failure): 249 | if failure.value.response.status == 429: 250 | self.logger.error( 251 | f"Received 429 Too Many Requests from {failure.request.url}" 252 | ) 253 | # Optionally, you can customize retry logic here 254 | 255 | def closed(self, reason): 256 | # self.update_database() 257 | with open(self.db_file, "w") as f: 258 | output_data = { 259 | "url": self.start_urls, 260 | "time": str(datetime.datetime.now()), 261 | "config": { 262 | "depth": self.depth_limit, 263 | "chunk_size": self.chunk_size, 264 | "embedding_model": self.embeddings_model_name, 265 | "allowed_domains": self.allowed_domains, 266 | }, 267 | "data": self.database.to_dict(orient="records"), 268 | } 269 | json.dump(output_data, f, indent=4) 270 | self.logger.info(f"Closed spider with reason: {reason}") 271 | self.logger.info(f"Total requests sent: {len(self.results)}") 272 | self.logger.info(f"Status code counts: {self.status_counts}") 273 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2024 Phospho SAS 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/app/models.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 | from scraper import (
3 |     TextContentSpider,
4 | )  # load the scraper from our scrapy project
5 | from scrapy.crawler import CrawlerProcess  # type:ignore
6 | from scrapy.utils.project import (  # type:ignore
7 |     get_project_settings,
8 | )
9 | from qdrant_client import QdrantClient
10 | from llama_index.vector_stores.qdrant import QdrantVectorStore
11 | from llama_index.core import StorageContext
12 | from llama_index.embeddings.mistralai import MistralAIEmbedding
13 | from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
14 | import time
15 | import json
16 | from mistralai import Mistral, AssistantMessage, ToolMessage
17 | import os
18 | import functools
19 | from typing import Generator
20 | from dotenv import load_dotenv
21 | from typing import List
22 | from pydantic import BaseModel
23 | import phospho
24 | 
25 | load_dotenv()
26 | 
27 | phospho.init()
28 | 
29 | # Check that the environment variables are set
30 | assert os.getenv("MISTRAL_API_KEY"), "MISTRAL_API_KEY environment variable not set"
31 | 
32 | 
33 | class QuestionOnUrlRequest(BaseModel):
34 |     question: str
35 | 
36 | 
37 | class ScraperInterface:
38 |     """
39 |     Scraper logic:
40 |     - the Scrapy spider uses a LinkExtractor (a URL follower: it finds all the URLs on a page and then follows them)
41 |     - all the scraped content is exported to a JSON file
42 |     - for the JSON format, check @json_format.py
43 |     """
44 | 
45 |     def __init__(self, domain, depth):
46 |         """
47 |         Initialize the ScraperInterface with domain and depth.
48 | 
49 |         :param domain: The domain to scrape.
50 |         :param depth: The depth of the crawl.
51 |         """
52 |         self.domain = domain
53 |         self.depth = depth
54 |         self.output_path = os.path.join(os.getcwd(), "data", f"{domain}.json")
55 |         self.spider_db = os.path.join(os.getcwd(), "data")
56 | 
57 |     def run_crawler(self):
58 |         """
59 |         Run the Scrapy crawler to scrape the website.
60 |         """
61 |         print("Running crawler")
62 |         start_time = time.time()
63 |         process = CrawlerProcess(get_project_settings())
64 |         process.crawl(
65 |             TextContentSpider,
66 |             domain=self.domain,
67 |             depth=self.depth,
68 |             output_path=self.output_path,
69 |             db_path=self.spider_db,
70 |         )
71 |         process.start()  # Start the reactor and perform all crawls
72 |         end_time = time.time()
73 |         logger.info(f"Time taken: {end_time - start_time} seconds")
74 | 
75 | 
76 | class EmbeddingsVS:
77 |     def __init__(self, domain):
78 |         """
79 |         Initialize the EmbeddingsVS with the domain.
80 |         These attributes make it easy to change which embedding model or parameters are used.
81 | 
82 |         :param domain: The domain to create embeddings for.
83 | """ 84 | self.vector_db_name = domain.replace(".", "_") 85 | self.domain = domain 86 | self.embed_model = MistralAIEmbedding( 87 | model_name="mistral-embed", api_key=os.getenv("MISTRAL_API_KEY") 88 | ) 89 | 90 | if os.getenv("QDRANT_API_KEY") and os.getenv("QDRANT_LOCATION"): 91 | logger.info("Connecting to Qdrant cloud") 92 | try: 93 | self.client = QdrantClient( 94 | api_key=os.getenv("QDRANT_API_KEY"), 95 | location=os.getenv("QDRANT_LOCATION"), 96 | ) 97 | except Exception as e: 98 | logger.error(f"Failed to connect to Qdrant: {str(e)}") 99 | self.client = None 100 | logger.error(f"QDRANT_API_KEY: {os.getenv('QDRANT_API_KEY')}") 101 | logger.error(f"QDRANT_LOCATION: {os.getenv('QDRANT_LOCATION')}") 102 | else: 103 | logger.info("Connecting to Qdrant local") 104 | self.client = QdrantClient( 105 | # you can use :memory: mode for fast and light-weight experiments, 106 | # it does not require to have Qdrant deployed anywhere 107 | # but requires qdrant-client >= 1.1.1 108 | # location=":memory:" 109 | # otherwise set Qdrant instance address with: 110 | # url="http://${escapeHtml(code.trim())}`;
463 | });
464 |
465 | // Process inline code
466 |     text = text.replace(/`([^`]+)`/g, '<code>$1</code>');
467 |
468 | // Process LaTeX
469 | text = text.replace(/\$\$([\s\S]*?)\$\$/g, (match, latex) => {
470 | return `'+m+'
';
492 |     });
493 | 
494 |     return text;
495 |   }
496 | 
497 |   function escapeHtml(unsafe) {
498 |     return unsafe
499 |       .replace(/&/g, "&amp;")
500 |       .replace(/</g, "&lt;")
501 |       .replace(/>/g, "&gt;")
502 |       .replace(/"/g, "&quot;")
503 |       .replace(/'/g, "&#039;");
504 |   }
505 | 
506 |   function addCopyButtons(container) {
507 |     const codeBlocks = container.querySelectorAll('pre');
508 |     codeBlocks.forEach((block) => {
509 |       if (!block.querySelector('.copy-btn')) {
510 |         const copyBtn = document.createElement('button');
511 |         copyBtn.textContent = 'Copy';
512 |         copyBtn.className = 'copy-btn';
513 |         copyBtn.addEventListener('click', () => {
514 |           const code = block.querySelector('code').textContent;
515 |           navigator.clipboard.writeText(code).then(() => {
516 |             copyBtn.textContent = 'Copied!';
517 |             setTimeout(() => {
518 |               copyBtn.textContent = 'Copy';
519 |             }, 2000);
520 |           });
521 |         });
522 |         block.appendChild(copyBtn);
523 |       }
524 |     });
525 |   }
526 | 
527 |   // Event listeners
528 |   chatBubble.addEventListener("click", toggleChat);
529 |   closeButton.addEventListener("click", closeChat);
530 |   sendButton.addEventListener("click", handleSendMessage);
531 | 
532 |   messageInput.addEventListener("keypress", (e) => {
533 |     if (e.key === "Enter") {
534 |       handleSendMessage();
535 |     }
536 |   });
537 | 
538 |   // Close chat when clicking outside
539 |   document.addEventListener("click", (e) => {
540 |     if (
541 |       isOpen &&
542 |       !chatWindow.contains(e.target) &&
543 |       !chatBubble.contains(e.target)
544 |     ) {
545 |       closeChat();
546 |     }
547 |   });
548 | 
549 | 
--------------------------------------------------------------------------------