├── nginx ├── Dockerfile └── nginx.conf ├── env.sh ├── .gitignore ├── api ├── wsgi.py ├── extensions.py ├── run.py ├── database │ ├── schemas │ │ ├── user_schema.py │ │ ├── chat_history_schema.py │ │ ├── functions_history_schema.py │ │ ├── resources_schema.py │ │ ├── knowledge_base_schema.py │ │ └── personal_memory_schema.py │ ├── models │ │ ├── user.py │ │ ├── chat_history.py │ │ ├── functions_history.py │ │ ├── resources.py │ │ ├── knowledge_base.py │ │ └── personal_memory.py │ └── db_manager.py ├── routes │ ├── clear_context_view.py │ ├── test_view.py │ ├── db_conversation_id_view.py │ ├── login_view.py │ ├── check_english_view.py │ ├── yt_summary_view.py │ ├── web_page_summary_view.py │ └── chat_view.py └── __init__.py ├── docs ├── logo.png ├── diagram.png ├── logo.py ├── scripts.py ├── todo.md ├── jupyter_notebooks │ ├── ollama.ipynb │ ├── web_summary.ipynb │ ├── yt_video_summary.ipynb │ └── document_rag.ipynb ├── feature_ideas.md └── notes.md ├── discord_bot ├── Dockerfile ├── .env.dist ├── bot_commands.py ├── config.py ├── utils.py └── assistant_bot.py ├── Dockerfile-env ├── update_container.sh ├── Dockerfile ├── docker-compose.local.yml ├── .env.dist ├── alembic ├── README ├── script.py.mako ├── versions │ ├── b6994a6fb482_add_context_column_to_functions_history.py │ ├── 7fbcdb262a79_increase_users_password_hash_length.py │ ├── 264699802ec3_create_users_table.py │ ├── e3fab275bece_add_chat_history_table.py │ ├── 0f632f48bc6d_add_functions_history_table.py │ ├── 6409f4f8492b_add_resources_table.py │ ├── 265ab9e632ed_add_personal_memory_table.py │ └── e62d40842589_add_knowledge_base_table.py └── env.py ├── docker-compose.workflows.yml ├── docker-compose.yml ├── tests └── test_assistant_api.py ├── README.md ├── alembic.ini ├── requirements.txt └── .github └── workflows └── basic.yml /nginx/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx 2 | COPY ./nginx.conf /etc/nginx/nginx.conf 
-------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build --no-cache -t assistant_env -f Dockerfile-env . -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | __pycache__/ 3 | actions-runner/ 4 | .DS_Store 5 | .vscode 6 | .history -------------------------------------------------------------------------------- /api/wsgi.py: -------------------------------------------------------------------------------- 1 | from api.run import app 2 | 3 | if __name__ == "__main__": 4 | app.run() 5 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/janbanot/personal_assistant/HEAD/docs/logo.png -------------------------------------------------------------------------------- /docs/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/janbanot/personal_assistant/HEAD/docs/diagram.png -------------------------------------------------------------------------------- /discord_bot/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM assistant_env 2 | WORKDIR /usr/src/app 3 | COPY .env .env 4 | COPY . . 
5 | CMD ["python3", "assistant_bot.py"] -------------------------------------------------------------------------------- /Dockerfile-env: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | WORKDIR /usr/src/app 3 | COPY requirements.txt ./ 4 | RUN pip install --no-cache-dir -r requirements.txt -------------------------------------------------------------------------------- /discord_bot/.env.dist: -------------------------------------------------------------------------------- 1 | DISCORD_TOKEN=XYZ 2 | DISCORD_GUILD_ID=1234 3 | API_USER_EMAIL=test@email.com 4 | API_PASSWORD=xyz 5 | API_URL=localhost:8081/ -------------------------------------------------------------------------------- /update_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | service="assistant_$1" 4 | 5 | docker-compose build $service 6 | docker-compose up -d --no-deps $service -------------------------------------------------------------------------------- /api/extensions.py: -------------------------------------------------------------------------------- 1 | from flask_sqlalchemy import SQLAlchemy 2 | from flask_marshmallow import Marshmallow 3 | from flask_jwt_extended import JWTManager 4 | 5 | db = SQLAlchemy() 6 | ma = Marshmallow() 7 | jwt = JWTManager() 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM assistant_env 2 | WORKDIR /usr/src/app 3 | COPY .env .env 4 | COPY api ./api 5 | COPY tests ./tests 6 | COPY alembic.ini . 
7 | COPY alembic ./alembic 8 | EXPOSE 8080 9 | CMD [ "gunicorn", "--bind", "0.0.0.0:8080", "--timeout", "180", "api.wsgi:app" ] -------------------------------------------------------------------------------- /nginx/nginx.conf: -------------------------------------------------------------------------------- 1 | events { worker_connections 1024; } 2 | 3 | http { 4 | server { 5 | listen 8081; 6 | server_name localhost; 7 | 8 | location / { 9 | proxy_pass http://assistant_api:8080; 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /api/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | from api import create_app 4 | 5 | load_dotenv() 6 | app = create_app() 7 | 8 | if __name__ == "__main__": 9 | is_debug = os.getenv("FLASK_DEBUG_MODE", "False").lower() == "true" 10 | app.run(debug=is_debug) 11 | -------------------------------------------------------------------------------- /api/database/schemas/user_schema.py: -------------------------------------------------------------------------------- 1 | from marshmallow_sqlalchemy import SQLAlchemyAutoSchema 2 | from api.database.models.user import User 3 | 4 | 5 | class UserSchema(SQLAlchemyAutoSchema): 6 | class Meta: 7 | model = User 8 | fields = ("id", "email", "login", "password_hash") 9 | -------------------------------------------------------------------------------- /docker-compose.local.yml: -------------------------------------------------------------------------------- 1 | services: 2 | assistant_proxy: 3 | build: 4 | context: ./nginx 5 | args: 6 | env: local 7 | dockerfile: Dockerfile 8 | ports: 9 | - "8081:8081" 10 | depends_on: 11 | - assistant_api 12 | assistant_qdrant: 13 | ports: 14 | - "6333:6333" -------------------------------------------------------------------------------- /.env.dist: -------------------------------------------------------------------------------- 
1 | FLASK_SECRET_KEY=XYZ 2 | FLASK_DEBUG_MODE=False 3 | POSTGRES_URL=postgresql://user:pass@db:5432/dbname 4 | POSTGRES_USER=user 5 | POSTGRES_PASSWORD=XYZ 6 | POSTGRES_DB=user_db 7 | OPENAI_API_KEY=sk-Abc 8 | LANGCHAIN_API_KEY=ls__abc 9 | LANGCHAIN_TRACING_V2=true 10 | LANGCHAIN_PROJECT=project_name 11 | LANGCHAIN_ENDPOINT=https://api.smith.langchain.com -------------------------------------------------------------------------------- /alembic/README: -------------------------------------------------------------------------------- 1 | # Alembic Migrations 2 | Examples of using Alembic to manage database migrations: 3 | - to apply migrations to the database, run the following command: 4 | ```bash 5 | alembic upgrade head 6 | ``` 7 | - to apply migration in docker-compose, run the following command: 8 | ```bash 9 | docker-compose exec assistant_api alembic upgrade head 10 | ``` -------------------------------------------------------------------------------- /api/database/schemas/chat_history_schema.py: -------------------------------------------------------------------------------- 1 | from marshmallow_sqlalchemy import SQLAlchemyAutoSchema 2 | from api.database.models.chat_history import ChatHistory 3 | 4 | 5 | class UserSchema(SQLAlchemyAutoSchema): 6 | class Meta: 7 | model = ChatHistory 8 | fields = ("message_id", "conversation_id", "user_message", "current_context", "answer") 9 | -------------------------------------------------------------------------------- /api/database/models/user.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String 2 | from api.extensions import db 3 | 4 | 5 | class User(db.Model): # type: ignore 6 | __tablename__ = "users" 7 | id = Column(Integer, primary_key=True) 8 | email = Column(String, unique=True) 9 | login = Column(String, unique=True) 10 | password_hash = Column(String) 11 | 
-------------------------------------------------------------------------------- /api/routes/clear_context_view.py: -------------------------------------------------------------------------------- 1 | from flask import jsonify 2 | from flask.views import MethodView 3 | from flask_jwt_extended import jwt_required 4 | from api.routes.chat_view import context_memory 5 | 6 | 7 | class ClearView(MethodView): 8 | decorators = [jwt_required()] 9 | 10 | def post(self): 11 | context_memory.clear() 12 | return jsonify({"message": "Context memory cleared"}) 13 | -------------------------------------------------------------------------------- /api/database/models/chat_history.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String 2 | from api.extensions import db 3 | 4 | 5 | class ChatHistory(db.Model): # type: ignore 6 | __tablename__ = "chat_history" 7 | message_id = Column(Integer, primary_key=True) 8 | conversation_id = Column(Integer) 9 | user_message = Column(String) 10 | current_context = Column(String) 11 | answer = Column(String) 12 | -------------------------------------------------------------------------------- /api/routes/test_view.py: -------------------------------------------------------------------------------- 1 | from flask import jsonify, request, current_app 2 | from flask.views import MethodView 3 | from flask_jwt_extended import jwt_required 4 | 5 | 6 | class TestView(MethodView): 7 | decorators = [jwt_required()] 8 | 9 | def get(self): 10 | data = "hello world" 11 | current_app.logger.info("Request: %s", request) 12 | current_app.logger.info("Response: %s", data) 13 | return jsonify({"data": data}) 14 | -------------------------------------------------------------------------------- /docs/logo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | from openai import OpenAI 4 | 5 | 
load_dotenv() 6 | openai_key = os.getenv("OPENAI_API_KEY") 7 | client = OpenAI(api_key=openai_key) 8 | 9 | response = client.images.generate( 10 | model="dall-e-3", 11 | prompt="a neon emblem logo of a robot personal assistant, simple, vector, color", 12 | size="1024x1024", 13 | quality="standard", 14 | n=1, 15 | ) 16 | 17 | image_url = response.data[0].url 18 | print(image_url) 19 | -------------------------------------------------------------------------------- /docker-compose.workflows.yml: -------------------------------------------------------------------------------- 1 | services: 2 | assistant_api: 3 | labels: 4 | - "workflow=${GITHUB_RUN_ID}" 5 | assistant_proxy: 6 | labels: 7 | - "workflow=${GITHUB_RUN_ID}" 8 | assistant_db: 9 | labels: 10 | - "workflow=${GITHUB_RUN_ID}" 11 | assistant_qdrant: 12 | labels: 13 | - "workflow=${GITHUB_RUN_ID}" 14 | assistant_bot: 15 | labels: 16 | - "workflow=${GITHUB_RUN_ID}" 17 | assistant_test: 18 | labels: 19 | - "workflow=${GITHUB_RUN_ID}" -------------------------------------------------------------------------------- /api/database/schemas/functions_history_schema.py: -------------------------------------------------------------------------------- 1 | from marshmallow_sqlalchemy import SQLAlchemyAutoSchema 2 | from api.database.models.functions_history import FunctionsHistory 3 | 4 | 5 | class FunctionsHistorySchema(SQLAlchemyAutoSchema): 6 | class Meta: 7 | model = FunctionsHistory 8 | fields = ( 9 | "id", 10 | "interaction_id", 11 | "function", 12 | "user_input", 13 | "answer", 14 | "created_at", 15 | "context" 16 | ) 17 | -------------------------------------------------------------------------------- /api/database/schemas/resources_schema.py: -------------------------------------------------------------------------------- 1 | from marshmallow_sqlalchemy import SQLAlchemyAutoSchema 2 | from api.database.models.resources import Resource 3 | 4 | 5 | class ResourceSchema(SQLAlchemyAutoSchema): 6 | class Meta: 7 | model 
= Resource 8 | fields = ( 9 | "id", 10 | "name", 11 | "content", 12 | "url", 13 | "tags", 14 | "category", 15 | "active", 16 | "created_at", 17 | "updated_at", 18 | ) 19 | -------------------------------------------------------------------------------- /api/routes/db_conversation_id_view.py: -------------------------------------------------------------------------------- 1 | from flask import jsonify 2 | from flask.views import MethodView 3 | from flask_jwt_extended import jwt_required 4 | from api.database.db_manager import DBManager 5 | 6 | 7 | class DBConversationIdView(MethodView): 8 | decorators = [jwt_required()] 9 | 10 | def __init__(self): 11 | self.db_manager = DBManager() 12 | 13 | def get(self): 14 | conversation_id = self.db_manager.get_current_conversation_id() 15 | return jsonify({"conversation_id": conversation_id}) 16 | -------------------------------------------------------------------------------- /api/database/schemas/knowledge_base_schema.py: -------------------------------------------------------------------------------- 1 | from marshmallow_sqlalchemy import SQLAlchemyAutoSchema 2 | from api.database.models.knowledge_base import KnowledgeBase 3 | 4 | 5 | class KnowledgeBaseSchema(SQLAlchemyAutoSchema): 6 | class Meta: 7 | model = KnowledgeBase 8 | fields = ( 9 | "id", 10 | "category", 11 | "tag", 12 | "content", 13 | "source", 14 | "created_at", 15 | "updated_at", 16 | "last_accessed_at", 17 | "active", 18 | ) 19 | -------------------------------------------------------------------------------- /api/database/schemas/personal_memory_schema.py: -------------------------------------------------------------------------------- 1 | from marshmallow_sqlalchemy import SQLAlchemyAutoSchema 2 | from api.database.models.personal_memory import PersonalMemory 3 | 4 | 5 | class PersonalMemorySchema(SQLAlchemyAutoSchema): 6 | class Meta: 7 | model = PersonalMemory 8 | fields = ( 9 | "id", 10 | "name", 11 | "description", 12 | "source", 13 | "category", 14 | 
"tags", 15 | "created_at", 16 | "updated_at", 17 | "active", 18 | ) 19 | -------------------------------------------------------------------------------- /api/database/models/functions_history.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, Text, DateTime 2 | from api.extensions import db 3 | import sqlalchemy as sa 4 | 5 | 6 | class FunctionsHistory(db.Model): # type: ignore 7 | __tablename__ = "functions_history" 8 | id = Column(Integer, primary_key=True) 9 | interaction_id = Column(Integer, nullable=False) 10 | function = Column(String(255), nullable=False) 11 | user_input = Column(Text, nullable=False) 12 | answer = Column(Text, nullable=False) 13 | created_at = Column(DateTime, nullable=False, server_default=sa.func.now()) 14 | context = Column(Text, nullable=True) 15 | -------------------------------------------------------------------------------- /alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | ${imports if imports else ""} 13 | 14 | # revision identifiers, used by Alembic. 
15 | revision: str = ${repr(up_revision)} 16 | down_revision: Union[str, None] = ${repr(down_revision)} 17 | branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} 18 | depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} 19 | 20 | 21 | def upgrade() -> None: 22 | ${upgrades if upgrades else "pass"} 23 | 24 | 25 | def downgrade() -> None: 26 | ${downgrades if downgrades else "pass"} 27 | -------------------------------------------------------------------------------- /alembic/versions/b6994a6fb482_add_context_column_to_functions_history.py: -------------------------------------------------------------------------------- 1 | """add context column to functions history 2 | 3 | Revision ID: b6994a6fb482 4 | Revises: 6409f4f8492b 5 | Create Date: 2024-06-22 09:34:02.802263 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'b6994a6fb482' 16 | down_revision: Union[str, None] = '6409f4f8492b' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | op.add_column('functions_history', sa.Column('context', sa.Text(), nullable=True)) 23 | 24 | 25 | def downgrade() -> None: 26 | op.drop_column('functions_history', 'context') 27 | -------------------------------------------------------------------------------- /api/database/models/resources.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean 2 | from api.extensions import db 3 | import sqlalchemy as sa 4 | 5 | 6 | class Resource(db.Model): # type: ignore 7 | __tablename__ = "resources" 8 | id = Column(Integer, primary_key=True) 9 | name = Column(String(255), nullable=False) 10 | content = Column(Text, nullable=True) 11 | url = Column(String(255), 
nullable=True) 12 | tags = Column(String(255), nullable=True) 13 | category = Column(String(255), nullable=False) 14 | active = Column(Boolean, nullable=False, server_default="true") 15 | created_at = Column(DateTime, nullable=False, server_default=sa.func.now()) 16 | updated_at = Column( 17 | DateTime, nullable=False, server_default=sa.func.now(), onupdate=sa.func.now() 18 | ) 19 | -------------------------------------------------------------------------------- /api/database/models/knowledge_base.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean 2 | from api.extensions import db 3 | import sqlalchemy as sa 4 | 5 | 6 | class KnowledgeBase(db.Model): # type: ignore 7 | __tablename__ = "knowledge_base" 8 | id = Column(Integer, primary_key=True) 9 | category = Column(String(255), nullable=False) 10 | tag = Column(String(255), nullable=False) 11 | content = Column(Text, nullable=False) 12 | source = Column(String(255), nullable=True) 13 | created_at = Column(DateTime, nullable=False, server_default=sa.func.now()) 14 | updated_at = Column( 15 | DateTime, nullable=False, server_default=sa.func.now(), onupdate=sa.func.now() 16 | ) 17 | last_accessed_at = Column(DateTime, nullable=True) 18 | active = Column(Boolean, nullable=False, server_default="true") 19 | -------------------------------------------------------------------------------- /api/database/models/personal_memory.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean 2 | from api.extensions import db 3 | import sqlalchemy as sa 4 | 5 | 6 | class PersonalMemory(db.Model): # type: ignore 7 | __tablename__ = "personal_memory" 8 | id = Column(Integer, primary_key=True) 9 | name = Column(String(255), nullable=False) 10 | description = Column(Text, nullable=True) 11 | source = Column(String(255), 
nullable=True) 12 | category = Column(String(255), nullable=False) 13 | tags = Column(String(255), nullable=True) 14 | created_at = Column(DateTime, nullable=False, server_default=sa.func.now()) 15 | updated_at = Column( 16 | DateTime, nullable=False, server_default=sa.func.now(), onupdate=sa.func.now() 17 | ) 18 | active = Column(Boolean, nullable=False, server_default="true") 19 | -------------------------------------------------------------------------------- /discord_bot/bot_commands.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from collections import namedtuple 3 | 4 | Command = namedtuple("Command", ["name", "description"]) 5 | 6 | 7 | class BotCommands(Enum): 8 | LIST_COMMANDS = Command("list-commands", "list all available commands") 9 | YT_SUMMARY = Command( 10 | "yt-summary", "get a summary of a YouTube video, provide a URL" 11 | ) 12 | PAGE_SUMMARY = Command( 13 | "page-summary", "get a summary of a page, provide a URL" 14 | ) 15 | CHECK_ENGLISH = Command( 16 | "check-english", 17 | "check and fix grammatical, spelling, and punctuation errors in English text", 18 | ) 19 | 20 | 21 | def get_bot_commands(): 22 | commands = [] 23 | for command in BotCommands: 24 | commands.append(f"- **{command.value.name}**: {command.value.description}") 25 | return "\n".join(commands) 26 | -------------------------------------------------------------------------------- /alembic/versions/7fbcdb262a79_increase_users_password_hash_length.py: -------------------------------------------------------------------------------- 1 | """increase users.password_hash length 2 | 3 | Revision ID: 7fbcdb262a79 4 | Revises: 264699802ec3 5 | Create Date: 2024-03-06 00:18:57.968757 6 | 7 | """ 8 | 9 | from typing import Sequence, Union 10 | 11 | from alembic import op 12 | import sqlalchemy as sa 13 | 14 | 15 | # revision identifiers, used by Alembic. 
16 | revision: str = "7fbcdb262a79" 17 | down_revision: Union[str, None] = "264699802ec3" 18 | branch_labels: Union[str, Sequence[str], None] = None 19 | depends_on: Union[str, Sequence[str], None] = None 20 | 21 | 22 | def upgrade(): 23 | op.alter_column( 24 | "users", "password_hash", existing_type=sa.String(128), type_=sa.String(256) 25 | ) 26 | 27 | 28 | def downgrade(): 29 | op.alter_column( 30 | "users", "password_hash", existing_type=sa.String(256), type_=sa.String(128) 31 | ) 32 | -------------------------------------------------------------------------------- /alembic/versions/264699802ec3_create_users_table.py: -------------------------------------------------------------------------------- 1 | """create users table 2 | 3 | Revision ID: 264699802ec3 4 | Create Date: 2024-03-03 18:25:23.656599 5 | 6 | """ 7 | 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = "264699802ec3" 16 | down_revision: Union[str, None] = None 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade(): 22 | op.create_table( 23 | "users", 24 | sa.Column("id", sa.Integer, primary_key=True), 25 | sa.Column("email", sa.String(120), unique=True, nullable=False), 26 | sa.Column("login", sa.String(64), unique=True, nullable=False), 27 | sa.Column("password_hash", sa.String(128), nullable=False), 28 | ) 29 | 30 | 31 | def downgrade(): 32 | op.drop_table("users") 33 | -------------------------------------------------------------------------------- /alembic/versions/e3fab275bece_add_chat_history_table.py: -------------------------------------------------------------------------------- 1 | """add chat_history table 2 | 3 | Revision ID: e3fab275bece 4 | Revises: 7fbcdb262a79 5 | Create Date: 2024-05-13 07:16:11.484875 6 | 7 | """ 8 | 9 | from typing import Sequence, Union 10 | 11 | from 
alembic import op 12 | import sqlalchemy as sa 13 | 14 | 15 | # revision identifiers, used by Alembic. 16 | revision: str = "e3fab275bece" 17 | down_revision: Union[str, None] = "7fbcdb262a79" 18 | branch_labels: Union[str, Sequence[str], None] = None 19 | depends_on: Union[str, Sequence[str], None] = None 20 | 21 | 22 | def upgrade(): 23 | op.create_table( 24 | "chat_history", 25 | sa.Column("message_id", sa.Integer, primary_key=True), 26 | sa.Column("conversation_id", sa.Integer), 27 | sa.Column("user_message", sa.String), 28 | sa.Column("current_context", sa.String), 29 | sa.Column("answer", sa.String), 30 | ) 31 | 32 | 33 | def downgrade(): 34 | op.drop_table("chat_history") 35 | -------------------------------------------------------------------------------- /api/database/db_manager.py: -------------------------------------------------------------------------------- 1 | from api.database.models.chat_history import ChatHistory 2 | from api.extensions import db 3 | 4 | 5 | class DBManager: 6 | def save_message(self, conversation_id, user_message, current_context, answer): 7 | new_message = ChatHistory( 8 | conversation_id=conversation_id, 9 | user_message=user_message, 10 | current_context=current_context, 11 | answer=answer, 12 | ) 13 | db.session.add(new_message) 14 | db.session.commit() 15 | 16 | def get_messages_by_conversation(self, conversation_id): 17 | messages = ( 18 | db.session.query(ChatHistory) 19 | .filter(ChatHistory.conversation_id == conversation_id) 20 | .all() 21 | ) 22 | return messages 23 | 24 | def get_current_conversation_id(self): 25 | latest_message = db.session.query(ChatHistory).order_by(ChatHistory.message_id.desc()).first() 26 | return latest_message.conversation_id if latest_message else 1 27 | -------------------------------------------------------------------------------- /docs/scripts.py: -------------------------------------------------------------------------------- 1 | # To run the script you need to execute bash in a running
api container 2 | # create a file, paste the content using the cat command (https://stackoverflow.com/a/60224966) 3 | # cat > file_to_edit 4 | # 1 Write or Paste you text 5 | # 2 don't forget to leave a blank line at the end of file 6 | # 3 Ctrl + C to apply configuration 7 | # and run the script 8 | from api.run import app 9 | from api.extensions import db 10 | from api.database.models.user import User 11 | from werkzeug.security import generate_password_hash 12 | 13 | 14 | def create_admin_user(email: str, login: str, password: str): 15 | hashed_password = generate_password_hash(password) 16 | new_user = User(email=email, login=login, password_hash=hashed_password) 17 | db.session.add(new_user) 18 | db.session.commit() 19 | 20 | 21 | if __name__ == "__main__": 22 | with app.app_context(): # Create an application context 23 | email = input("Enter email: ") 24 | login = input("Enter login: ") 25 | password = input("Enter password: ") 26 | create_admin_user(email, login, password) 27 | print("User created") 28 | -------------------------------------------------------------------------------- /alembic/versions/0f632f48bc6d_add_functions_history_table.py: -------------------------------------------------------------------------------- 1 | """add functions_history table 2 | 3 | Revision ID: 0f632f48bc6d 4 | Revises: e62d40842589 5 | Create Date: 2024-06-15 13:14:00.332058 6 | 7 | """ 8 | 9 | from typing import Sequence, Union 10 | 11 | from alembic import op 12 | import sqlalchemy as sa 13 | 14 | 15 | # revision identifiers, used by Alembic. 
16 | revision: str = "0f632f48bc6d" 17 | down_revision: Union[str, None] = "e62d40842589" 18 | branch_labels: Union[str, Sequence[str], None] = None 19 | depends_on: Union[str, Sequence[str], None] = None 20 | 21 | 22 | def upgrade() -> None: 23 | op.create_table( 24 | "functions_history", 25 | sa.Column("id", sa.Integer, primary_key=True), 26 | sa.Column("interaction_id", sa.Integer, nullable=False), 27 | sa.Column("function", sa.String(255), nullable=False), 28 | sa.Column("user_input", sa.Text, nullable=False), 29 | sa.Column("answer", sa.Text, nullable=False), 30 | sa.Column( 31 | "created_at", sa.DateTime, nullable=False, server_default=sa.func.now() 32 | ), 33 | ) 34 | 35 | 36 | def downgrade() -> None: 37 | op.drop_table("functions_history") 38 | -------------------------------------------------------------------------------- /api/routes/login_view.py: -------------------------------------------------------------------------------- 1 | from flask import jsonify, request 2 | from flask.views import MethodView 3 | from werkzeug.security import check_password_hash 4 | from flask_jwt_extended import create_access_token 5 | from datetime import timedelta 6 | from api.database.models.user import User 7 | # from api.database.models.user_schema import UserSchema 8 | 9 | 10 | class LoginView(MethodView): 11 | 12 | def post(self): 13 | # user_schema = UserSchema() 14 | # users_schema = UserSchema(many=True) 15 | 16 | if request.is_json: 17 | email = request.json["email"] 18 | password = request.json["password"] 19 | else: 20 | email = request.form["email"] 21 | password = request.form["password"] 22 | 23 | user = User.query.filter_by(email=email).first() 24 | if user and check_password_hash(user.password_hash, password): 25 | expires = timedelta(minutes=30) 26 | access_token = create_access_token(identity=email, expires_delta=expires) 27 | return jsonify( 28 | message="Login Successful", 29 | access_token=access_token, 30 | expires_in=expires.total_seconds(), 31 | ) 
32 | else: 33 | return jsonify("Bad email or Password"), 401 34 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | assistant_api: 3 | restart: on-failure 4 | build: 5 | context: ./ 6 | dockerfile: Dockerfile 7 | depends_on: 8 | - assistant_db 9 | environment: 10 | POSTGRES_URL: ${POSTGRES_URL} 11 | PYTHONPATH: /usr/src/app 12 | assistant_proxy: 13 | restart: on-failure 14 | build: 15 | context: ./nginx 16 | args: 17 | env: prod 18 | dockerfile: Dockerfile 19 | depends_on: 20 | - assistant_api 21 | assistant_db: 22 | image: postgres:latest 23 | restart: on-failure 24 | environment: 25 | POSTGRES_USER: ${POSTGRES_USER} 26 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 27 | POSTGRES_DB: ${POSTGRES_DB} 28 | volumes: 29 | - postgres_data:/var/lib/postgresql/data/ 30 | assistant_qdrant: 31 | image: qdrant/qdrant:latest 32 | volumes: 33 | - qdrant_data:/qdrant/storage 34 | assistant_bot: 35 | restart: on-failure 36 | build: 37 | context: ./discord_bot 38 | dockerfile: Dockerfile 39 | volumes: 40 | - assistant_bot_data:/app/data 41 | assistant_test: 42 | build: 43 | context: ./ 44 | dockerfile: Dockerfile 45 | command: pytest tests/ 46 | environment: 47 | POSTGRES_URL: ${POSTGRES_URL} 48 | PYTHONPATH: /usr/src/app 49 | 50 | volumes: 51 | postgres_data: 52 | qdrant_data: 53 | assistant_bot_data: -------------------------------------------------------------------------------- /api/routes/check_english_view.py: -------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv 2 | from flask import jsonify, request 3 | from flask.views import MethodView 4 | from flask_jwt_extended import jwt_required 5 | from langchain_openai import ChatOpenAI 6 | from langchain_core.output_parsers import StrOutputParser 7 | from langchain_core.prompts import ChatPromptTemplate 8 | 9 | load_dotenv() 
10 | 11 | llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0) 12 | 13 | system_prompt = """ 14 | Adjust the user's text to rectify grammatical, spelling, and punctuation errors, maintaining the original layout. 15 | Interpret ambiguities with discernment. Overlook extraneous comments. 16 | Return only the rectified text! 17 | Examples: 18 | Q: wheres the best place to meet for a quick chat? 19 | A: Where's the best place to meet for a quick chat? 20 | Q: i cant believe its already been a year since we started this project! 21 | A: I can't believe it's already been a year since we started this project! 22 | ### 23 | User's text: {text} 24 | """ 25 | 26 | prompt = ChatPromptTemplate.from_template(system_prompt) 27 | output_parser = StrOutputParser() 28 | 29 | 30 | class CheckEnglishView(MethodView): 31 | decorators = [jwt_required()] 32 | 33 | def post(self): 34 | data = request.get_json() 35 | text = data.get("text", "") 36 | 37 | chain = prompt | llm | output_parser 38 | answer = chain.invoke({"text": text}) 39 | 40 | return jsonify({"text": answer}) 41 | -------------------------------------------------------------------------------- /alembic/versions/6409f4f8492b_add_resources_table.py: -------------------------------------------------------------------------------- 1 | """add resources table 2 | 3 | Revision ID: 6409f4f8492b 4 | Revises: 265ab9e632ed 5 | Create Date: 2024-06-15 13:17:26.673095 6 | 7 | """ 8 | 9 | from typing import Sequence, Union 10 | 11 | from alembic import op 12 | import sqlalchemy as sa 13 | 14 | 15 | # revision identifiers, used by Alembic. 
16 | revision: str = "6409f4f8492b" 17 | down_revision: Union[str, None] = "265ab9e632ed" 18 | branch_labels: Union[str, Sequence[str], None] = None 19 | depends_on: Union[str, Sequence[str], None] = None 20 | 21 | 22 | def upgrade() -> None: 23 | op.create_table( 24 | "resources", 25 | sa.Column("id", sa.Integer, primary_key=True), 26 | sa.Column("name", sa.String(255), nullable=False), 27 | sa.Column("content", sa.Text, nullable=True), 28 | sa.Column("url", sa.String(255), nullable=True), 29 | sa.Column("tags", sa.String(255), nullable=True), 30 | sa.Column("category", sa.String(255), nullable=False), 31 | sa.Column("active", sa.Boolean, nullable=False, server_default="true"), 32 | sa.Column( 33 | "created_at", sa.DateTime, nullable=False, server_default=sa.func.now() 34 | ), 35 | sa.Column( 36 | "updated_at", 37 | sa.DateTime, 38 | nullable=False, 39 | server_default=sa.func.now(), 40 | onupdate=sa.func.now(), 41 | ), 42 | ) 43 | 44 | 45 | def downgrade() -> None: 46 | op.drop_table("resources") 47 | -------------------------------------------------------------------------------- /alembic/versions/265ab9e632ed_add_personal_memory_table.py: -------------------------------------------------------------------------------- 1 | """add personal_memory table 2 | 3 | Revision ID: 265ab9e632ed 4 | Revises: 0f632f48bc6d 5 | Create Date: 2024-06-15 13:15:35.943222 6 | 7 | """ 8 | 9 | from typing import Sequence, Union 10 | 11 | from alembic import op 12 | import sqlalchemy as sa 13 | 14 | 15 | # revision identifiers, used by Alembic. 
16 | revision: str = "265ab9e632ed" 17 | down_revision: Union[str, None] = "0f632f48bc6d" 18 | branch_labels: Union[str, Sequence[str], None] = None 19 | depends_on: Union[str, Sequence[str], None] = None 20 | 21 | 22 | def upgrade() -> None: 23 | op.create_table( 24 | "personal_memory", 25 | sa.Column("id", sa.Integer, primary_key=True), 26 | sa.Column("name", sa.String(255), nullable=False), 27 | sa.Column("description", sa.Text, nullable=True), 28 | sa.Column("source", sa.String(255), nullable=True), 29 | sa.Column("category", sa.String(255), nullable=False), 30 | sa.Column("tags", sa.String(255), nullable=True), 31 | sa.Column( 32 | "created_at", sa.DateTime, nullable=False, server_default=sa.func.now() 33 | ), 34 | sa.Column( 35 | "updated_at", 36 | sa.DateTime, 37 | nullable=False, 38 | server_default=sa.func.now(), 39 | onupdate=sa.func.now(), 40 | ), 41 | sa.Column("active", sa.Boolean, nullable=False, server_default="true"), 42 | ) 43 | 44 | 45 | def downgrade() -> None: 46 | op.drop_table("personal_memory") 47 | -------------------------------------------------------------------------------- /alembic/versions/e62d40842589_add_knowledge_base_table.py: -------------------------------------------------------------------------------- 1 | """add knowledge_base table 2 | 3 | Revision ID: e62d40842589 4 | Revises: e3fab275bece 5 | Create Date: 2024-06-15 13:09:52.554254 6 | 7 | """ 8 | 9 | from typing import Sequence, Union 10 | 11 | from alembic import op 12 | import sqlalchemy as sa 13 | 14 | 15 | # revision identifiers, used by Alembic. 
16 | revision: str = "e62d40842589" 17 | down_revision: Union[str, None] = "e3fab275bece" 18 | branch_labels: Union[str, Sequence[str], None] = None 19 | depends_on: Union[str, Sequence[str], None] = None 20 | 21 | 22 | def upgrade() -> None: 23 | op.create_table( 24 | "knowledge_base", 25 | sa.Column("id", sa.Integer, primary_key=True), 26 | sa.Column("category", sa.String(255), nullable=False), 27 | sa.Column("tag", sa.String(255), nullable=False), 28 | sa.Column("content", sa.Text, nullable=False), 29 | sa.Column("source", sa.String(255), nullable=True), 30 | sa.Column( 31 | "created_at", sa.DateTime, nullable=False, server_default=sa.func.now() 32 | ), 33 | sa.Column( 34 | "updated_at", 35 | sa.DateTime, 36 | nullable=False, 37 | server_default=sa.func.now(), 38 | onupdate=sa.func.now(), 39 | ), 40 | sa.Column("last_accessed_at", sa.DateTime, nullable=True), 41 | sa.Column("active", sa.Boolean, nullable=False, server_default="true"), 42 | ) 43 | 44 | 45 | def downgrade() -> None: 46 | op.drop_table("knowledge_base") 47 | -------------------------------------------------------------------------------- /docs/todo.md: -------------------------------------------------------------------------------- 1 | # To Do List/Backlog for the project 2 | - [x] project structure and initial setup 3 | - [x] dockerize the app 4 | - [x] add db migrations mechanism 5 | - [x] add unit tests 6 | - [x] add token-based authentication (JWT) 7 | - [x] add test user and login endpoint 8 | - [x] add langchain 9 | - [x] add qdrant 10 | - [x] create conversation bot foundation 11 | - [x] configure CI/CD - github actions 12 | - [x] rethink whole CI/CD workflow once again! 13 | - [x] add discord bot 14 | - [x] add basic option to talk with bot using model 15 | - [x] YT video summary in English 16 | - [] longterm memory and personalization RAG 17 | - [] add functionality to create bookmarks (yt-videos, articles, etc.) 
18 | - [] add integration with apple watch using shortcuts to hit endpoints 19 | - [] google search endpoint 20 | - [] test different models 21 | - [] anthropic claude/haiku 22 | - [] groq 23 | 24 | # Nice to have/do 25 | - [] change used library from request to aiohttp to allow async requests to improve performance 26 | - [] add types to the project 27 | - [x] add langsmith support 28 | - [] try what can be achieved with gpt-4o 29 | - [] refactor the code to avoid duplicates 30 | - [] play with agents approach 31 | - [] check cloudflare workers 32 | - [] check test containers (https://testcontainers.com/, https://www.docker.com/blog/local-development-of-go-applications-with-testcontainers/) 33 | - [] check idea of obsidian vault as knowledge base 34 | 35 | # Bugs 36 | - [] fix issue with api not working after some time, no requests are being processed and bot is throwing an error about missing summary in response. Add handling for such error and try to fix that issue 37 | - [] check why bot container starts automatically when the docker engine is started 38 | - [] no login attempt before /commands, so there is an error in case of no valid token 39 | -------------------------------------------------------------------------------- /tests/test_assistant_api.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from api.run import app 3 | 4 | 5 | @pytest.fixture 6 | def client(): 7 | app.config["TESTING"] = True 8 | with app.test_client() as client: 9 | yield client 10 | 11 | 12 | def login(client, email, password): 13 | data = {"email": email, "password": password} 14 | headers = {"Content-Type": "application/json"} 15 | return client.post("/login", json=data, headers=headers) 16 | 17 | 18 | def chat(client, message, headers): 19 | data = {"message": message} 20 | return client.post("/chat", json=data, headers=headers) 21 | 22 | # TODO: write a test without using the hardcoded credentials 23 | # def 
test_login_route(client): 24 | # response = login(client, "test@test.com", "test1") 25 | # assert response.status_code == 200 26 | # assert response.json["message"] == "Login Successful" 27 | # assert "access_token" in response.json 28 | # assert response.json["expires_in"] == 1800 29 | 30 | 31 | def test_login_route_with_incorrect_password(client): 32 | response = login(client, "wrong@email.com", "wrong_password") 33 | assert response.status_code == 401 34 | assert "access_token" not in response.json 35 | 36 | 37 | # def test_login_route_with_incorrect_email(client): 38 | # response = login(client, "test1@test.com", "test1") 39 | # assert response.status_code == 401 40 | # assert "access_token" not in response.json 41 | 42 | # def test_access_chat_route_with_valid_token(client): 43 | # login_response = login(client, "test@test.com", "test1") 44 | # valid_token = login_response.json["access_token"] 45 | # headers = { 46 | # "Content-Type": "application/json", 47 | # "Authorization": f"Bearer {valid_token}", 48 | # } 49 | # chat_response = chat(client, "hello", headers) 50 | # assert chat_response.status_code == 200 51 | -------------------------------------------------------------------------------- /docs/jupyter_notebooks/ollama.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# source: https://github.com/ollama/ollama/blob/main/docs/tutorials/langchainpy.md\n", 10 | "\n", 11 | "from langchain_community.llms import Ollama\n", 12 | "from langchain_community.document_loaders import WebBaseLoader\n", 13 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 14 | "from langchain_community.embeddings import OllamaEmbeddings\n", 15 | "from langchain_community.vectorstores import Chroma\n", 16 | "from langchain.chains import RetrievalQA\n", 17 | "\n", 18 | "ollama = 
Ollama(base_url='http://localhost:11434', model=\"llama2\")\n", 19 | "\n", 20 | "loader = WebBaseLoader(\"https://www.gutenberg.org/files/1727/1727-h/1727-h.htm\")\n", 21 | "data = loader.load()\n", 22 | "\n", 23 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", 24 | "all_splits = text_splitter.split_documents(data)\n", 25 | "\n", 26 | "oembed = OllamaEmbeddings(base_url=\"http://localhost:11434\", model=\"nomic-embed-text\")\n", 27 | "# store the emeddings locally - maybe qdrant?\n", 28 | "vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed)\n", 29 | "\n", 30 | "question = \"Who is Neleus and who is in Neleus' family?\"\n", 31 | "print(question)\n", 32 | "docs = vectorstore.similarity_search(question)\n", 33 | "\n", 34 | "qachain = RetrievalQA.from_chain_type(ollama, retriever=vectorstore.as_retriever())\n", 35 | "print(qachain.invoke({\"query\": question}))" 36 | ] 37 | } 38 | ], 39 | "metadata": { 40 | "kernelspec": { 41 | "display_name": "jupyter", 42 | "language": "python", 43 | "name": "python3" 44 | }, 45 | "language_info": { 46 | "name": "python", 47 | "version": "3.11.7" 48 | } 49 | }, 50 | "nbformat": 4, 51 | "nbformat_minor": 2 52 | } 53 | -------------------------------------------------------------------------------- /discord_bot/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | 5 | class Config: 6 | def __init__(self): 7 | self._api_token = None 8 | self._api_token_expires_at = None 9 | self._current_conversation_id = None 10 | self._current_conversation_last_message_timestamp = None 11 | self.discord_token = os.getenv("DISCORD_TOKEN") 12 | self.discord_guild_id = os.getenv("DISCORD_GUILD_ID") 13 | # chat, chat-testing 14 | self.chatting_channels_ids = [1238228569021349948, 1238223813997756446] 15 | 16 | if not self.discord_token: 17 | raise ValueError("DISCORD_TOKEN is not set") 18 | 19 | if 
not self.discord_guild_id: 20 | raise ValueError("DISCORD_GUILD_ID is not set") 21 | 22 | @property 23 | def api_token(self): 24 | return self._api_token 25 | 26 | @api_token.setter 27 | def api_token(self, value): 28 | self._api_token = value 29 | 30 | @property 31 | def api_token_expires_at(self): 32 | return self._api_token_expires_at 33 | 34 | @api_token_expires_at.setter 35 | def api_token_expires_at(self, value): 36 | self._api_token_expires_at = value 37 | 38 | def is_token_valid(self): 39 | return ( 40 | self.api_token 41 | and self.api_token_expires_at 42 | and self.api_token_expires_at > datetime.now() 43 | ) 44 | 45 | @property 46 | def current_conversation_id(self): 47 | return self._current_conversation_id 48 | 49 | @current_conversation_id.setter 50 | def current_conversation_id(self, value): 51 | self._current_conversation_id = value 52 | 53 | @property 54 | def current_conversation_last_message_timestamp(self): 55 | return self._current_conversation_last_message_timestamp 56 | 57 | @current_conversation_last_message_timestamp.setter 58 | def current_conversation_last_message_timestamp(self, value): 59 | self._current_conversation_last_message_timestamp = value 60 | -------------------------------------------------------------------------------- /docs/feature_ideas.md: -------------------------------------------------------------------------------- 1 | # Assistant feature ideas 2 | Assistant should be availiable through various interfaces. 3 | The main one would be discord bot - a server with multiple channels for diffrent purposes. 
4 | Additionally it should be available through voice assistant - using watch/phone commands + shortcuts 5 | 6 | ## Basic functionalities 7 | - conversation bot - chat like experience, ask about anything 8 | - enrich the answers with google search - from default or executed on demand 9 | - brave https://brave.com/search/api/ 10 | - duckduckgo 11 | - serp api 12 | - long term memory - use information from the knowledge base and info about user to enrich prompts to make its answers more accurate 13 | - config file with the basic info about the user to give the context 14 | - feature to save info to long term memory 15 | - feature to retrieve info from long term memory 16 | - useful functions (chat modes) 17 | - prompts that can be used in various situations eg. correct grammar, wording, translate into different languages 18 | - prompts for creating code - https://qdrant.tech/documentation/tutorials/code-search/ 19 | - create a day summary based on the calendar events 20 | - remind about different things based on the created events 21 | - feature to create reminder - remind me about sending that email in 30 minutes 22 | - feature to create recurring reminders - remind me to stand up every 30 minutes 23 | - save notes/quotes from books, articles, etc. 24 | - save notes using chat or voice 25 | - save it with tags, so it can be easily categorized and then retrieved 26 | - daily summary (readwise like) - send a selected quote or note from the list and present it in the chat 27 | - feature to retrieve notes using tags 28 | 29 | ### Feature ideas for the future 30 | - basic app instead of discord server - streamlit app? 31 | - mode for creative ideas discussion, brainstorming, problem solving - agent like? 32 | - help creating notes on content consumed (books, articles, videos, podcasts, films, series etc.)
33 | - summarize yt video 34 | - summarize podcast or article -------------------------------------------------------------------------------- /docs/jupyter_notebooks/web_summary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from langchain_community.document_loaders import WebBaseLoader\n", 10 | "\n", 11 | "URL = \"https://www.kalzumeus.com/2011/10/28/dont-call-yourself-a-programmer/\"\n", 12 | "\n", 13 | "JINA_PREFIX = \"https://r.jina.ai/\"\n", 14 | "\n", 15 | "loader = WebBaseLoader(URL)\n", 16 | "data = loader.load()" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 20, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import requests\n", 26 | "import json\n", 27 | "\n", 28 | "response = requests.get(JINA_PREFIX + URL, headers={\"Accept\": \"application/json\"})\n", 29 | "response_json = json.loads(response.text)\n", 30 | "response_json_content = response_json[\"data\"][\"content\"]\n", 31 | "metadata = {\"title\": response_json[\"data\"][\"title\"],\n", 32 | " \"url\": response_json[\"data\"][\"url\"]}" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 22, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "{'title': \"Don't Call Yourself A Programmer, And Other Career Advice\", 'url': 'https://www.kalzumeus.com/2011/10/28/dont-call-yourself-a-programmer/'}\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "# print(response.text)\n", 50 | "print(metadata)\n", 51 | "# print(data)" 52 | ] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "jupyter", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | 
"mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.11.7" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

logo

2 | 3 | # Personal Assistant 4 | A virtual personal assistant designed to simplify your daily tasks. The assistant can be accessed through the Command Line Interface (CLI) or as a Discord bot. With the assistance of Generative AI, it automates daily activities, making your life more organized and easier. 5 | 6 | Project built for the [100 commitów](https://100commitow.pl/) competition. Its goal is to develop an open source project for 100 days. 7 | 8 | ## Technologies 9 | [![Technologies](https://skillicons.dev/icons?i=py,flask,postgres,docker,githubactions)](https://skillicons.dev) 10 | 11 | ## Architecture diagram 12 |

architecture diagram

13 | 14 | ## How to run? 15 | You only need to have docker and docker-compose installed on your machine. Then you can run the following commands: 16 | 1. Clone the repository```git clone https://github.com/janbanot/personal_assistant.git``` 17 | 2. Create an image with python virtual environment and install all the dependencies```bash env.sh``` 18 | 3. Build the project ```bash build.sh``` 19 | - Build the project using local docker-compose settings ```bash build.sh --local``` 20 | 21 | ## Current state of the project 22 | ### Project architecture 23 | - [x] Dockerized environment 24 | - [x] CI/CD pipeline 25 | - [x] test framework 26 | - [x] the first basic tests are implemented. 27 | 28 | ### API 29 | - [x] token-based authentication and login process 30 | - [x] endpoint to communicate with the assistant 31 | - [x] endpoint to get YT video summary 32 | - [x] endpoint to get page summary 33 | 34 | ### Discord bot 35 | - [x] bot configuration 36 | - [x] command to communicate with the assistant 37 | - [x] command to get YT video summary 38 | - [x] command to get page summary 39 | - [x] command to fix English text 40 | 41 | ### Next steps: 42 | - Improve chatbot functionalities 43 | - Add more commands with prompts covering frequent use cases 44 | - Add memory to the chatbot (information about the user, context of the conversation, possibility to save data) 45 | 46 | ## Read more about the project 47 | - [feature ideas](docs/feature_ideas.md) 48 | - [todo list](docs/todo.md) 49 | -------------------------------------------------------------------------------- /api/routes/yt_summary_view.py: -------------------------------------------------------------------------------- 1 | from flask import request, jsonify, current_app 2 | from flask.views import MethodView 3 | from flask_jwt_extended import jwt_required 4 | from dotenv import load_dotenv 5 | from langchain_core.prompts import PromptTemplate 6 | from langchain_community.document_loaders import YoutubeLoader 7 | 
from langchain_openai import ChatOpenAI 8 | from langchain.chains.summarize import load_summarize_chain 9 | from langchain.text_splitter import RecursiveCharacterTextSplitter 10 | 11 | load_dotenv() 12 | 13 | 14 | class YTSummaryView(MethodView): 15 | decorators = [jwt_required()] 16 | 17 | def post(self): 18 | data = request.get_json() 19 | url = data.get("url", "") 20 | 21 | current_app.logger.info("Request: %s", request) 22 | current_app.logger.info("URL: %s", url) 23 | 24 | try: 25 | loader = YoutubeLoader.from_youtube_url(url, add_video_info=True) 26 | except Exception as e: 27 | print(f"Invalid YouTube URL: {url}. Error: {e}") 28 | results = loader.load() 29 | 30 | llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0) 31 | 32 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=500) 33 | 34 | for document in results: 35 | text_content = document.page_content 36 | 37 | docs = text_splitter.create_documents([text_content]) 38 | 39 | map_prompt = """ 40 | Write a concise summary of the following: 41 | "{text}" 42 | CONCISE SUMMARY: 43 | """ 44 | 45 | map_prompt_template = PromptTemplate( 46 | template=map_prompt, input_variables=["text"] 47 | ) 48 | 49 | summary_combine_prompt = """" 50 | Write detailed and comprehensive summary of the video transcript text. 51 | The summary should cover the main points and key details of the text. 52 | Return your response in bullet points. 53 | ```{text}``` 54 | BULLET POINT SUMMARY: 55 | """ 56 | 57 | summary_combine_prompt_template = PromptTemplate( 58 | template=summary_combine_prompt, input_variables=["text"] 59 | ) 60 | 61 | summary_chain = load_summarize_chain( 62 | llm=llm, 63 | chain_type="map_reduce", 64 | map_prompt=map_prompt_template, 65 | combine_prompt=summary_combine_prompt_template, 66 | # verbose=True 67 | ) 68 | 69 | summary_output = summary_chain.run(docs) 70 | 71 | # TODO: add option to ask question about the video, to extend a point from summary, etc. 
72 | return jsonify({"summary": summary_output}) 73 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | from flask import Flask 5 | from api.extensions import db, ma, jwt 6 | from api.routes.login_view import LoginView 7 | from api.routes.chat_view import ChatView 8 | from api.routes.clear_context_view import ClearView 9 | from api.routes.test_view import TestView 10 | from api.routes.yt_summary_view import YTSummaryView 11 | from api.routes.check_english_view import CheckEnglishView 12 | from api.routes.web_page_summary_view import WebPageSummaryView 13 | from api.routes.db_conversation_id_view import DBConversationIdView 14 | 15 | 16 | def create_app(): 17 | app = Flask(__name__) 18 | SECRET_KEY = os.getenv("FLASK_SECRET_KEY", "this is a secret key") 19 | app.secret_key = SECRET_KEY 20 | app.config["SQLALCHEMY_DATABASE_URI"] = os.getenv("POSTGRES_URL", "") 21 | 22 | # Initialize the instances with the app 23 | db.init_app(app) 24 | ma.init_app(app) 25 | jwt.init_app(app) 26 | 27 | app.add_url_rule( 28 | "/", 29 | view_func=TestView.as_view("test_view"), 30 | methods=[ 31 | "GET", 32 | ], 33 | ) 34 | app.add_url_rule( 35 | "/login", 36 | view_func=LoginView.as_view("login_view"), 37 | methods=[ 38 | "POST", 39 | ], 40 | ) 41 | app.add_url_rule( 42 | "/chat", 43 | view_func=ChatView.as_view("chat_view"), 44 | methods=[ 45 | "POST", 46 | ], 47 | ) 48 | app.add_url_rule( 49 | "/clear-context", 50 | view_func=ClearView.as_view("clear_view"), 51 | methods=[ 52 | "POST", 53 | ], 54 | ) 55 | app.add_url_rule( 56 | "/yt-summary", 57 | view_func=YTSummaryView.as_view("yt_summary_view"), 58 | methods=[ 59 | "POST", 60 | ], 61 | ) 62 | app.add_url_rule( 63 | "/check-english", 64 | view_func=CheckEnglishView.as_view("check_english_view"), 65 | methods=[ 66 | "POST", 67 | ], 68 | ) 69 | 
app.add_url_rule( 70 | "/page-summary", 71 | view_func=WebPageSummaryView.as_view("page_summary_view"), 72 | methods=[ 73 | "POST", 74 | ], 75 | ) 76 | app.add_url_rule( 77 | "/db/conversation-id", 78 | view_func=DBConversationIdView.as_view("db_conversation_id_view"), 79 | methods=[ 80 | "GET", 81 | ], 82 | ) 83 | 84 | # Configure logging 85 | logging.basicConfig( 86 | level=logging.DEBUG, handlers=[logging.StreamHandler(sys.stdout)] 87 | ) 88 | app.logger.info("Starting application") 89 | 90 | return app 91 | -------------------------------------------------------------------------------- /alembic/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from sqlalchemy import engine_from_config 4 | from sqlalchemy import pool 5 | 6 | from alembic import context 7 | 8 | from dotenv import load_dotenv 9 | 10 | import os 11 | 12 | load_dotenv() 13 | 14 | # this is the Alembic Config object, which provides 15 | # access to the values within the .ini file in use. 16 | config = context.config 17 | 18 | # Replace the sqlalchemy.url value with the one from .env file 19 | database_url = os.getenv('POSTGRES_URL') 20 | 21 | if not database_url: 22 | raise ValueError('POSTGRES_URL environment variable not set') 23 | 24 | config.set_main_option('sqlalchemy.url', database_url) 25 | 26 | # Interpret the config file for Python logging. 27 | # This line sets up loggers basically. 28 | if config.config_file_name is not None: 29 | fileConfig(config.config_file_name) 30 | 31 | # add your model's MetaData object here 32 | # for 'autogenerate' support 33 | # from myapp import mymodel 34 | # target_metadata = mymodel.Base.metadata 35 | target_metadata = None # type: ignore 36 | 37 | # other values from the config, defined by the needs of env.py, 38 | # can be acquired: 39 | # my_important_option = config.get_main_option("my_important_option") 40 | # ... etc. 
41 | 42 | 43 | def run_migrations_offline() -> None: 44 | """Run migrations in 'offline' mode. 45 | 46 | This configures the context with just a URL 47 | and not an Engine, though an Engine is acceptable 48 | here as well. By skipping the Engine creation 49 | we don't even need a DBAPI to be available. 50 | 51 | Calls to context.execute() here emit the given string to the 52 | script output. 53 | 54 | """ 55 | url = config.get_main_option("sqlalchemy.url") 56 | context.configure( 57 | url=url, 58 | target_metadata=target_metadata, 59 | literal_binds=True, 60 | dialect_opts={"paramstyle": "named"}, 61 | ) 62 | 63 | with context.begin_transaction(): 64 | context.run_migrations() 65 | 66 | 67 | def run_migrations_online() -> None: 68 | """Run migrations in 'online' mode. 69 | 70 | In this scenario we need to create an Engine 71 | and associate a connection with the context. 72 | 73 | """ 74 | connectable = engine_from_config( 75 | config.get_section(config.config_ini_section, {}), 76 | prefix="sqlalchemy.", 77 | poolclass=pool.NullPool, 78 | ) 79 | 80 | with connectable.connect() as connection: 81 | context.configure( 82 | connection=connection, target_metadata=target_metadata 83 | ) 84 | 85 | with context.begin_transaction(): 86 | context.run_migrations() 87 | 88 | 89 | if context.is_offline_mode(): 90 | run_migrations_offline() 91 | else: 92 | run_migrations_online() 93 | -------------------------------------------------------------------------------- /api/routes/web_page_summary_view.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from flask import request, jsonify, current_app 4 | from flask.views import MethodView 5 | from flask_jwt_extended import jwt_required 6 | from dotenv import load_dotenv 7 | from langchain_openai import ChatOpenAI 8 | from langchain_core.prompts import PromptTemplate 9 | from langchain.text_splitter import RecursiveCharacterTextSplitter 10 | from 
langchain.chains.summarize import load_summarize_chain 11 | 12 | load_dotenv() 13 | 14 | 15 | class WebPageSummaryView(MethodView): 16 | decorators = [jwt_required()] 17 | 18 | def post(self): 19 | JINA_PREFIX = "https://r.jina.ai/" 20 | 21 | data = request.get_json() 22 | url = data.get("url", "") 23 | 24 | current_app.logger.info("Request: %s", request) 25 | current_app.logger.info("URL: %s", url) 26 | 27 | try: 28 | response = requests.get(JINA_PREFIX + url, headers={"Accept": "application/json"}) 29 | except Exception as e: 30 | print(f"Invalid URL: {url}. Error: {e}") 31 | 32 | response_json = json.loads(response.text) 33 | text_content = response_json["data"]["content"] 34 | # metadata = {"title": response_json["data"]["title"], "url": response_json["data"]["url"]} 35 | 36 | llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0) 37 | 38 | text_splitter = RecursiveCharacterTextSplitter( 39 | separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500 40 | ) 41 | 42 | docs = text_splitter.create_documents([text_content]) 43 | 44 | # TODO: rewrite to share code with yt_summary_view 45 | map_prompt = """ 46 | Write a concise summary of the following: 47 | "{text}" 48 | CONCISE SUMMARY: 49 | """ 50 | 51 | map_prompt_template = PromptTemplate( 52 | template=map_prompt, input_variables=["text"] 53 | ) 54 | 55 | summary_combine_prompt = """" 56 | Write detailed and comprehensive summary of the article. 57 | The summary should cover the main points and key details of the text. 58 | Return your response in bullet points. 
59 | ```{text}``` 60 | BULLET POINT SUMMARY: 61 | """ 62 | 63 | summary_combine_prompt_template = PromptTemplate( 64 | template=summary_combine_prompt, input_variables=["text"] 65 | ) 66 | 67 | summary_chain = load_summarize_chain( 68 | llm=llm, 69 | chain_type="map_reduce", 70 | map_prompt=map_prompt_template, 71 | combine_prompt=summary_combine_prompt_template, 72 | # verbose=True 73 | ) 74 | 75 | summary_output = summary_chain.run(docs) 76 | 77 | # TODO: add option to ask question about the text, to extend a point from summary, etc. 78 | return jsonify({"summary": summary_output}) 79 | -------------------------------------------------------------------------------- /api/routes/chat_view.py: -------------------------------------------------------------------------------- 1 | from flask import request, jsonify 2 | from flask.views import MethodView 3 | from flask_jwt_extended import jwt_required 4 | from dotenv import load_dotenv 5 | from datetime import datetime 6 | from dateutil import tz # type: ignore 7 | from langchain_openai import ChatOpenAI 8 | from langchain.prompts.prompt import PromptTemplate 9 | from langchain.chains import ConversationChain 10 | from langchain.memory import ConversationSummaryMemory 11 | from api.database.db_manager import DBManager 12 | 13 | # TODO: try to implement it without using the langchain directly 14 | load_dotenv() 15 | 16 | context_memory = ConversationSummaryMemory(llm=ChatOpenAI(), ai_prefix="Assistant") 17 | # TODO: refactor to use the same instance of the model 18 | llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0) 19 | 20 | current_datetime = datetime.now() 21 | gmt_plus_2_tz = tz.gettz('Etc/GMT-2') 22 | current_datetime = current_datetime.astimezone(gmt_plus_2_tz) 23 | current_datetime_str = current_datetime.strftime('%Y/%m/%d, %H:%M:%S') 24 | 25 | datetime_string = f"Current datetime: {current_datetime_str} \n" 26 | 27 | template = datetime_string + """ 28 | You are an AI assistant designed for ultra-concise, 
engaging conversations. 29 | Follow these rules: 30 | - Use the fewest words possible while maintaining clarity, impact and natural language 31 | - Keep a friendly, casual tone with occasional colloquialisms 32 | - Always wrap code with triple backticks and keywords with `single backticks` 33 | - Ask for clarification to avoid assumptions 34 | - Detect intentions and emotional states to tailor responses perfectly. 35 | - Focus solely on instructions and provide relevant, comprehensive responses 36 | - Never repeat info or mention limitations 37 | - Simplify complex tasks; provide the best output possible 38 | - Prioritize user needs; tailor responses to their context and goals 39 | - When asked for specific content, start response with requested info immediately 40 | - Continuously improve based on user feedback 41 | 42 | Current conversation: 43 | {history} 44 | Human: {input} 45 | AI Assistant: 46 | """ 47 | 48 | # TODO: Check why langchain default template example is sent in the context with every message? 
49 | PROMPT = PromptTemplate(input_variables=["history", "input"], template=template) 50 | 51 | conversation = ConversationChain( 52 | prompt=PROMPT, 53 | llm=llm, 54 | verbose=True, 55 | memory=context_memory, 56 | ) 57 | 58 | 59 | class ChatView(MethodView): 60 | decorators = [jwt_required()] 61 | 62 | def __init__(self): 63 | self.db_manager = DBManager() 64 | 65 | def post(self): 66 | data = request.get_json() 67 | input_text = data.get("message", "") 68 | conversation_id = data.get("conversation_id", "") 69 | 70 | current_context = context_memory.buffer 71 | 72 | result = conversation.predict(input=input_text) 73 | 74 | self.db_manager.save_message(conversation_id, input_text, current_context, result) 75 | 76 | return jsonify({"message": result}) 77 | -------------------------------------------------------------------------------- /docs/notes.md: -------------------------------------------------------------------------------- 1 | # Useful developmnet notes 2 | ### TODO 3 | change to use https://github.com/casey/just 4 | change to use password as variable that is set in the first command and then used in the next ones 5 | 6 | ## DB 7 | - connect to db 8 | ```bash 9 | psql postgresql://USER:PASSWORD@assistant_db:5432/assistant_db 10 | ``` 11 | 12 | ## CURLs 13 | - login 14 | ```bash 15 | curl -X POST -H "Content-Type: application/json" -d '{"email": "test@test.com", "password": "test1"}' http://localhost:8081/login 16 | ``` 17 | 18 | - endpoint with token 19 | ```bash 20 | curl -X GET -H "Authorization: Bearer ABC" http://localhost:8081/ 21 | ``` 22 | 23 | - chat 24 | ```bash 25 | curl -X POST -H "Authorization: Bearer ABC" -H "Content-Type: application/json" -d '{"message": "Hello", "conversation_id": -1}' http://localhost:8081/chat 26 | ``` 27 | 28 | - save token as variable and use it later 29 | ```bash 30 | TOKEN=$(curl -s -X POST -H "Content-Type: application/json" -d '{"email": "test@test.com", "password": "test1"}' http://localhost:8081/login | jq -r 
'.access_token') 31 | 32 | curl -X POST -H "Authorization: Bearer $TOKEN" -H "Content-Type: application/json" -d '{"message": "Hello", "conversation_id": -1}' http://localhost:8081/chat 33 | ``` 34 | 35 | - yt_summary endpoint 36 | ```bash 37 | curl -X POST -H "Authorization: Bearer $TOKEN" -H "Content-Type: application/json" -d '{"url": "https://www.youtube.com/watch?v=YEJUUB1LNFM"}' http://localhost:8081/yt-summary 38 | ``` 39 | 40 | - check_english endpoint 41 | ```bash 42 | curl -X POST -H "Authorization: Bearer $TOKEN" -H "Content-Type: application/json" -d '{"text": "My name is Susan. Im fourteen and I live in Germany. My hobbies are going to discos, sometimes I listen to music on the radio. In the summer, I go swimming in a lake. I dont have any brothers or sisters. We take buses to school. Im in year 9 at my school. My birthday is on Friday. I hope I will get a new guitar."}' http://localhost:8081/check-english 43 | ``` 44 | 45 | ## Debugging 46 | - debug in docker container 47 | - add debugpy fragment to the code 48 | ``` python 49 | import debugpy 50 | debugpy.listen(("0.0.0.0", 5678)) 51 | debugpy.wait_for_client() 52 | ``` 53 | - in docker-compose expose port 5678 54 | ```yaml 55 | services: 56 | your-service: 57 | ports: 58 | - "5678:5678" 59 | ``` 60 | - modify launch.json config 61 | ```json 62 | { 63 | "version": "0.2.0", 64 | "configurations": [ 65 | { 66 | "name": "Python: Remote Attach", 67 | "type": "python", 68 | "request": "attach", 69 | "connect": { 70 | "host": "localhost", 71 | "port": 5678 72 | }, 73 | "pathMappings": [ 74 | { 75 | "localRoot": "${workspaceFolder}", 76 | "remoteRoot": "/app" 77 | } 78 | ] 79 | } 80 | ] 81 | } 82 | ``` 83 | - add breakpoint in the code 84 | ```python 85 | debugpy.breakpoint() 86 | ``` 87 | - run docker-compose 88 | - attach debugger in VSCode -------------------------------------------------------------------------------- /alembic.ini: 
-------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python>=3.9 or backports.zoneinfo library. 20 | # Any required deps can installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to ZoneInfo() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to alembic/versions. When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 
42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # set to 'true' to search source files recursively 55 | # in each "version_locations" directory 56 | # new in Alembic version 1.10 57 | # recursive_version_locations = false 58 | 59 | # the output encoding used when revision files 60 | # are written from script.py.mako 61 | # output_encoding = utf-8 62 | 63 | sqlalchemy.url = ${POSTGRES_URL} 64 | 65 | 66 | [post_write_hooks] 67 | # post_write_hooks defines scripts or Python functions that are run 68 | # on newly generated revision scripts. 
See the documentation for further 69 | # detail and examples 70 | 71 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 72 | # hooks = black 73 | # black.type = console_scripts 74 | # black.entrypoint = black 75 | # black.options = -l 79 REVISION_SCRIPT_FILENAME 76 | 77 | # lint with attempts to fix using "ruff" - use the exec runner, execute a binary 78 | # hooks = ruff 79 | # ruff.type = exec 80 | # ruff.executable = %(here)s/.venv/bin/ruff 81 | # ruff.options = --fix REVISION_SCRIPT_FILENAME 82 | 83 | # Logging configuration 84 | [loggers] 85 | keys = root,sqlalchemy,alembic 86 | 87 | [handlers] 88 | keys = console 89 | 90 | [formatters] 91 | keys = generic 92 | 93 | [logger_root] 94 | level = WARN 95 | handlers = console 96 | qualname = 97 | 98 | [logger_sqlalchemy] 99 | level = WARN 100 | handlers = 101 | qualname = sqlalchemy.engine 102 | 103 | [logger_alembic] 104 | level = INFO 105 | handlers = 106 | qualname = alembic 107 | 108 | [handler_console] 109 | class = StreamHandler 110 | args = (sys.stderr,) 111 | level = NOTSET 112 | formatter = generic 113 | 114 | [formatter_generic] 115 | format = %(levelname)-5.5s [%(name)s] %(message)s 116 | datefmt = %H:%M:%S 117 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.9.3 2 | aiosignal==1.3.1 3 | alembic==1.13.1 4 | annotated-types==0.6.0 5 | anyio==4.3.0 6 | appnope==0.1.4 7 | asgiref==3.8.1 8 | asttokens==2.4.1 9 | asyncio==3.4.3 10 | attrs==23.2.0 11 | backoff==2.2.1 12 | bcrypt==4.1.2 13 | beautifulsoup4==4.12.3 14 | blinker==1.7.0 15 | build==1.2.1 16 | cachetools==5.3.3 17 | certifi==2024.2.2 18 | charset-normalizer==3.3.2 19 | chroma-hnswlib==0.7.3 20 | chromadb==0.4.24 21 | click==8.1.7 22 | coloredlogs==15.0.1 23 | comm==0.2.2 24 | dataclasses-json==0.6.4 25 | debugpy==1.8.1 26 | decorator==5.1.1 27 | Deprecated==1.2.14 
28 | discord.py==2.3.2 29 | distro==1.9.0 30 | executing==2.0.1 31 | fastapi==0.110.1 32 | filelock==3.13.4 33 | flake8==7.0.0 34 | Flask==3.0.2 35 | Flask-JWT-Extended==4.6.0 36 | flask-marshmallow==1.2.0 37 | Flask-SQLAlchemy==3.1.1 38 | flatbuffers==24.3.25 39 | frozenlist==1.4.1 40 | fsspec==2024.3.1 41 | google-auth==2.29.0 42 | googleapis-common-protos==1.63.0 43 | grpcio==1.62.0 44 | grpcio-tools==1.62.0 45 | gunicorn==21.2.0 46 | h11==0.14.0 47 | h2==4.1.0 48 | hpack==4.0.0 49 | httpcore==1.0.4 50 | httptools==0.6.1 51 | httpx==0.27.0 52 | huggingface-hub==0.22.2 53 | humanfriendly==10.0 54 | hyperframe==6.0.1 55 | idna==3.6 56 | importlib-metadata==7.0.0 57 | importlib_resources==6.4.0 58 | iniconfig==2.0.0 59 | ipython==8.23.0 60 | itsdangerous==2.1.2 61 | jedi==0.19.1 62 | Jinja2==3.1.3 63 | jsonpatch==1.33 64 | jsonpointer==2.4 65 | jupyter_client==8.6.1 66 | jupyter_core==5.7.2 67 | kubernetes==29.0.0 68 | langchain==0.1.16 69 | langchain-community==0.0.32 70 | langchain-core==0.1.42 71 | langchain-openai==0.1.3 72 | langchain-text-splitters==0.0.1 73 | langchainhub==0.1.15 74 | langsmith==0.1.47 75 | Mako==1.3.2 76 | markdown-it-py==3.0.0 77 | MarkupSafe==2.1.5 78 | marshmallow==3.21.0 79 | marshmallow-sqlalchemy==1.0.0 80 | matplotlib-inline==0.1.6 81 | mccabe==0.7.0 82 | mdurl==0.1.2 83 | mmh3==4.1.0 84 | monotonic==1.6 85 | mpmath==1.3.0 86 | multidict==6.0.5 87 | mypy==1.8.0 88 | mypy-extensions==1.0.0 89 | nest-asyncio==1.6.0 90 | numpy==1.26.4 91 | oauthlib==3.2.2 92 | onnxruntime==1.17.3 93 | openai==1.17.1 94 | opentelemetry-api==1.24.0 95 | opentelemetry-exporter-otlp-proto-common==1.24.0 96 | opentelemetry-exporter-otlp-proto-grpc==1.24.0 97 | opentelemetry-instrumentation==0.45b0 98 | opentelemetry-instrumentation-asgi==0.45b0 99 | opentelemetry-instrumentation-fastapi==0.45b0 100 | opentelemetry-proto==1.24.0 101 | opentelemetry-sdk==1.24.0 102 | opentelemetry-semantic-conventions==0.45b0 103 | opentelemetry-util-http==0.45b0 104 | 
orjson==3.10.0 105 | overrides==7.7.0 106 | packaging==23.2 107 | parso==0.8.4 108 | pexpect==4.9.0 109 | platformdirs==4.2.0 110 | pluggy==1.4.0 111 | portalocker==2.8.2 112 | posthog==3.5.0 113 | prompt-toolkit==3.0.43 114 | protobuf==4.25.3 115 | psutil==5.9.8 116 | psycopg2==2.9.9 117 | ptyprocess==0.7.0 118 | pulsar-client==3.4.0 119 | pure-eval==0.2.2 120 | pyasn1==0.6.0 121 | pyasn1_modules==0.4.0 122 | pycodestyle==2.11.1 123 | pydantic==2.6.3 124 | pydantic_core==2.16.3 125 | pyflakes==3.2.0 126 | Pygments==2.17.2 127 | PyJWT==2.8.0 128 | PyPika==0.48.9 129 | pyproject_hooks==1.0.0 130 | pytest==8.0.2 131 | python-dateutil==2.9.0.post0 132 | python-dotenv==1.0.1 133 | pytube==15.0.0 134 | PyYAML==6.0.1 135 | pyzmq==25.1.2 136 | qdrant-client==1.8.0 137 | regex==2023.12.25 138 | requests==2.31.0 139 | requests-oauthlib==2.0.0 140 | rich==13.7.1 141 | rsa==4.9 142 | shellingham==1.5.4 143 | six==1.16.0 144 | sniffio==1.3.1 145 | soupsieve==2.5 146 | SQLAlchemy==2.0.27 147 | stack-data==0.6.3 148 | starlette==0.37.2 149 | sympy==1.12 150 | tenacity==8.2.3 151 | tiktoken==0.5.2 152 | tokenizers==0.15.2 153 | tornado==6.4 154 | tqdm==4.66.2 155 | traitlets==5.14.2 156 | typer==0.12.3 157 | types-requests==2.31.0.20240218 158 | typing-inspect==0.9.0 159 | typing_extensions==4.10.0 160 | urllib3==2.2.1 161 | uvicorn==0.29.0 162 | uvloop==0.19.0 163 | watchfiles==0.21.0 164 | wcwidth==0.2.13 165 | websocket-client==1.7.0 166 | websockets==12.0 167 | Werkzeug==3.0.1 168 | wrapt==1.16.0 169 | yarl==1.9.4 170 | youtube-transcript-api==0.6.2 171 | zipp==3.18.1 172 | -------------------------------------------------------------------------------- /.github/workflows/basic.yml: -------------------------------------------------------------------------------- 1 | name: Basic Workflow 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | setup: 9 | runs-on: self-hosted 10 | environment: production 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: 
Check for changes in requirements.txt 16 | id: check 17 | run: | 18 | if git diff --name-only ${{ github.event.before }} ${{ github.sha }} | grep '^requirements.txt$'; then 19 | echo "deps_changed=true" >> $GITHUB_ENV 20 | else 21 | echo "deps_changed=false" >> $GITHUB_ENV 22 | fi 23 | 24 | - name: Run Env 25 | run: bash env.sh 26 | if: env.deps_changed == 'true' 27 | 28 | build: 29 | needs: setup 30 | runs-on: self-hosted 31 | environment: production 32 | 33 | steps: 34 | - uses: actions/checkout@v4 35 | 36 | - name: Create .env files 37 | run: | 38 | echo "FLASK_SECRET_KEY=${{ secrets.FLASK_SECRET_KEY }}" > .env 39 | echo "FLASK_DEBUG_MODE=${{ secrets.FLASK_DEBUG_MODE }}" >> .env 40 | echo "POSTGRES_URL=${{ secrets.POSTGRES_URL }}" >> .env 41 | echo "POSTGRES_USER=${{ secrets.POSTGRES_USER }}" >> .env 42 | echo "POSTGRES_PASSWORD=${{ secrets.POSTGRES_PASSWORD }}" >> .env 43 | echo "POSTGRES_DB=${{ secrets.POSTGRES_DB }}" >> .env 44 | echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> .env 45 | echo "LANGCHAIN_API_KEY=${{ secrets.LANGCHAIN_API_KEY }}" >> .env 46 | echo "LANGCHAIN_TRACING_V2=${{ secrets.LANGCHAIN_TRACING_V2 }}" >> .env 47 | echo "LANGCHAIN_PROJECT=${{ secrets.LANGCHAIN_PROJECT }}" >> .env 48 | echo "LANGCHAIN_ENDPOINT=${{ secrets.LANGCHAIN_ENDPOINT }}" >> .env 49 | mkdir -p ./discord_bot/ 50 | echo "DISCORD_TOKEN=${{ secrets.DISCORD_TOKEN }}" > ./discord_bot/.env 51 | echo "DISCORD_GUILD_ID=${{ secrets.DISCORD_GUILD_ID }}" >> ./discord_bot/.env 52 | echo "API_USER_EMAIL=${{ secrets.API_USER_EMAIL }}" >> ./discord_bot/.env 53 | echo "API_PASSWORD=${{ secrets.API_PASSWORD }}" >> ./discord_bot/.env 54 | echo "API_URL=${{ secrets.API_URL }}" >> ./discord_bot/.env 55 | 56 | - name: Run Build 57 | run: | 58 | export GITHUB_RUN_ID=${{ github.run_id }} 59 | bash build.sh 60 | 61 | test: 62 | needs: build 63 | runs-on: self-hosted 64 | environment: production 65 | 66 | steps: 67 | - uses: actions/checkout@v4 68 | 69 | - name: Create .env files 70 | run: 
| 71 | echo "FLASK_SECRET_KEY=${{ secrets.FLASK_SECRET_KEY }}" > .env 72 | echo "FLASK_DEBUG_MODE=${{ secrets.FLASK_DEBUG_MODE }}" >> .env 73 | echo "POSTGRES_URL=${{ secrets.POSTGRES_URL }}" >> .env 74 | echo "POSTGRES_USER=${{ secrets.POSTGRES_USER }}" >> .env 75 | echo "POSTGRES_PASSWORD=${{ secrets.POSTGRES_PASSWORD }}" >> .env 76 | echo "POSTGRES_DB=${{ secrets.POSTGRES_DB }}" >> .env 77 | echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> .env 78 | echo "LANGCHAIN_API_KEY=${{ secrets.LANGCHAIN_API_KEY }}" >> .env 79 | echo "LANGCHAIN_TRACING_V2=${{ secrets.LANGCHAIN_TRACING_V2 }}" >> .env 80 | echo "LANGCHAIN_PROJECT=${{ secrets.LANGCHAIN_PROJECT }}" >> .env 81 | echo "LANGCHAIN_ENDPOINT=${{ secrets.LANGCHAIN_ENDPOINT }}" >> .env 82 | mkdir -p ./discord_bot/ 83 | echo "DISCORD_TOKEN=${{ secrets.DISCORD_TOKEN }}" > ./discord_bot/.env 84 | echo "DISCORD_GUILD_ID=${{ secrets.DISCORD_GUILD_ID }}" >> ./discord_bot/.env 85 | echo "API_USER_EMAIL=${{ secrets.API_USER_EMAIL }}" >> ./discord_bot/.env 86 | echo "API_PASSWORD=${{ secrets.API_PASSWORD }}" >> ./discord_bot/.env 87 | echo "API_URL=${{ secrets.API_URL }}" >> ./discord_bot/.env 88 | 89 | - name: Run Tests 90 | id: tests 91 | run: | 92 | docker-compose up --build --exit-code-from assistant_test assistant_test 93 | 94 | - name: Cleanup Docker images from that run 95 | if: ${{ failure() }} 96 | run: | 97 | export GITHUB_RUN_ID=${{ github.run_id }} 98 | docker stop $(docker ps -a -q -f "label=workflow=$GITHUB_RUN_ID") 99 | docker rm $(docker ps -a -q -f "label=workflow=$GITHUB_RUN_ID") 100 | 101 | - name: Cleanup Test Container 102 | if: always() 103 | run: | 104 | if [ "$(docker ps -a -q -f name=assistant_test)" ]; then 105 | docker-compose stop assistant_test 106 | docker-compose rm -f assistant_test 107 | fi 108 | 109 | - name: Cleanup Unused Images 110 | if: always() 111 | run: docker image prune -f 112 | -------------------------------------------------------------------------------- 
import os
import requests
import json
from datetime import datetime, timedelta
from dotenv import load_dotenv

# TODO: change to use aiohttp instead of requests, so discord commands can utilize async

load_dotenv()

# TODO: move to config object?
EMAIL = os.getenv("API_USER_EMAIL")
PASSWORD = os.getenv("API_PASSWORD")
URL = os.getenv("API_URL")

# A conversation is considered over after this much inactivity; the next
# message then starts a fresh conversation (see is_new_conversation).
CONVERSATION_TIMEOUT = timedelta(minutes=10)


def _auth_headers(config, with_json=False):
    """Build the Authorization header (plus Content-Type for JSON bodies)."""
    headers = {"Authorization": f"Bearer {config.api_token}"}
    if with_json:
        headers["Content-Type"] = "application/json"
    return headers


def _post(config, endpoint, response_key, data=None):
    """POST to URL + endpoint and return response.json()[response_key].

    When `data` is given it is serialized as a JSON body; otherwise the
    request is sent without a body (and without a Content-Type header),
    matching the original per-endpoint request code this helper replaces.
    """
    body = json.dumps(data) if data is not None else None
    response = requests.post(
        URL + endpoint,
        headers=_auth_headers(config, with_json=data is not None),
        data=body,
    )
    return response.json()[response_key]


def get_valid_token(config):
    """Ensure config holds a valid API token, logging in again if needed.

    Returns True when a valid token is available afterwards.
    """
    if not config.is_token_valid():
        login(config)
    return config.is_token_valid()


def update_conversation_timestamp(config):
    """Record 'now' as the time of the current conversation's latest message."""
    config.current_conversation_last_message_timestamp = datetime.now()


def is_new_conversation(config):
    """Return True when the previous conversation has gone stale.

    A conversation with no recorded message, or whose last message is older
    than CONVERSATION_TIMEOUT, counts as new.
    """
    last = config.current_conversation_last_message_timestamp
    if last is None:
        return True
    return datetime.now() - last > CONVERSATION_TIMEOUT


def conversation_context_handler(config, force_clear=False):
    """Advance or initialise the conversation id and clear stale context.

    Returns the API's clear-context message when force_clear is set,
    otherwise None.
    """
    if force_clear:
        config.current_conversation_id += 1
        update_conversation_timestamp(config)
        return clear_context(config)
    elif not config.current_conversation_id:
        config.current_conversation_id = 1
        update_conversation_timestamp(config)
    elif is_new_conversation(config):
        config.current_conversation_id += 1
        update_conversation_timestamp(config)
        clear_context(config)
    else:
        update_conversation_timestamp(config)

    return None


# TODO: change request to aiohttp
def login(config):
    """Authenticate against the API and store token/expiry on config.

    On failure — non-200 response, or a 200 response missing `expires_in`
    (which previously raised a TypeError when building the expiry time) —
    the stored credentials are reset to None.
    """
    response = requests.post(
        URL + "login",
        headers={"Content-Type": "application/json"},
        json={"email": EMAIL, "password": PASSWORD},
    )
    if response.status_code == 200:
        response_data = response.json()
        expires_in = response_data.get("expires_in")
        if expires_in is not None:
            config.api_token = response_data.get("access_token")
            config.api_token_expires_at = datetime.now() + timedelta(seconds=expires_in)
            return
    # Fall through: login failed or the response was malformed.
    config.api_token = None
    config.api_token_expires_at = None


def hello_world(config):
    """Smoke-test call against the API root endpoint."""
    response = requests.get(URL, headers=_auth_headers(config))
    return response.json()


def chat(config, message):
    """Send a chat message within the current conversation."""
    data = {"message": message, "conversation_id": config.current_conversation_id}
    return _post(config, "chat", "message", data)


def clear_context(config):
    """Ask the API to drop the current conversation context."""
    return _post(config, "clear-context", "message")


def yt_summary(config, video_url):
    """Request a summary of a YouTube video."""
    return _post(config, "yt-summary", "summary", {"url": video_url})


def page_summary(config, page_url):
    """Request a summary of a web page."""
    return _post(config, "page-summary", "summary", {"url": page_url})


def check_english(config, text):
    """Request a corrected version of an English text."""
    return _post(config, "check-english", "text", {"text": text})


def get_conversation_id(config):
    """Fetch the most recent conversation id stored in the DB."""
    response = requests.get(URL + "db/conversation-id", headers=_auth_headers(config))
    return response.json()["conversation_id"]
@bot.event
async def on_message(message):
    """Dispatch free-form chat messages, then let command processing run."""
    # Never react to ourselves or to other bots (avoids reply loops).
    if message.author == bot.user or message.author.bot:
        return

    if message.channel.id in config.chatting_channels_ids:
        await handle_bot_chatting(message)

    await bot.process_commands(message)


async def handle_bot_chatting(message):
    """Answer a chat message via the API, clearing context on '!clear'."""
    if get_valid_token(config):
        if message.content.startswith("!clear"):
            response = conversation_context_handler(config, force_clear=True)
        else:
            conversation_context_handler(config)
            response = chat(config, message.content)
        await message.channel.send(response)
    else:
        await message.channel.send("Could not get API token")


@bot.command(name="sync", description="Sync commands tree commands")
async def sync_command(ctx: commands.Context):
    """Copy the global app-command tree to the guild and sync it."""
    bot.tree.copy_global_to(guild=MY_GUILD)
    await bot.tree.sync(guild=MY_GUILD)
    await ctx.send("Commands synced!")


@bot.command(name="get_messages", description="Get messages from a channel")
async def get_messages(ctx: commands.Context, channel_id: int, days: int):
    """Report how many messages a channel received in the last `days` days."""
    channel = bot.get_channel(channel_id)
    if not channel:
        await ctx.send("Channel not found.")
        return

    # Use an aware UTC timestamp: discord.py 2.x treats *naive* datetimes in
    # history() filters as local time, so the naive datetime.utcnow() value
    # skewed the window on non-UTC hosts.
    end_date = discord.utils.utcnow()
    start_date = end_date - timedelta(days=days)

    messages = []
    async for message in channel.history(limit=None, after=start_date, before=end_date):  # type: ignore
        messages.append(f"{message.created_at}: {message.author}: {message.content}")

    await ctx.send(f"Retrieved {len(messages)} messages from the last {days} days.")
    # Guard: indexing an empty list would raise IndexError when the channel
    # had no traffic in the requested window.
    if messages:
        await ctx.send(f"First message: {messages[0]}")
        await ctx.send(f"Last message: {messages[-1]}")


# TODO: refactor to utilize async and remove duplicated code
# TODO: !!!! fix problem with unauthorized error on commands
# handle 401 errors + add handling for errors in api so in bot we can display anything
# add option to ask more questions about the video based on the content
@bot.command(
    name=BotCommands.YT_SUMMARY.value.name,
    description=BotCommands.YT_SUMMARY.value.description,
)
async def yt_summary_command(ctx: commands.Context, url: str):
    """Summarize a YouTube video; the blocking API call runs off-loop."""
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor() as pool:
        summary = await loop.run_in_executor(pool, yt_summary, config, url)
    await ctx.send(summary)


@bot.command(
    name=BotCommands.PAGE_SUMMARY.value.name,
    description=BotCommands.PAGE_SUMMARY.value.description,
)
async def page_summary_command(ctx: commands.Context, url: str):
    """Summarize a web page; the blocking API call runs off-loop."""
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor() as pool:
        summary = await loop.run_in_executor(pool, page_summary, config, url)
    await ctx.send(summary)


@bot.command(
    name=BotCommands.CHECK_ENGLISH.value.name,
    description=BotCommands.CHECK_ENGLISH.value.description,
)
async def check_english_command(ctx: commands.Context, *, input_text: str):
    """Correct an English text.

    Runs the blocking HTTP call off the event loop, consistent with the
    other API-backed commands (the previous direct call blocked the loop
    for the whole request duration).
    """
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor() as pool:
        fixed_text = await loop.run_in_executor(pool, check_english, config, input_text)
    await ctx.send(fixed_text)


@bot.tree.command(
    name=BotCommands.LIST_COMMANDS.value.name,
    description="Get a list of all available commands",
)
async def list_all_commands(interaction: discord.Interaction) -> None:
    """Slash command: send the list of every bot command."""
    commands_list = str(get_bot_commands())
    await interaction.response.send_message(commands_list)


bot.run(config.discord_token)
{}, 6 | "source": [ 7 | "# YT video summary playground\n", 8 | "Playground for testing YT english video summary generation" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 168, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import os\n", 18 | "from dotenv import load_dotenv\n", 19 | "from langchain_community.document_loaders import YoutubeLoader\n", 20 | "from langchain_openai import ChatOpenAI\n", 21 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 22 | "from langchain.chains.summarize import load_summarize_chain\n", 23 | "\n", 24 | "load_dotenv()\n", 25 | "openai_key = os.getenv(\"OPENAI_API_KEY\")\n", 26 | "\n", 27 | "# url = \"https://youtu.be/ThnVAgHzsLg?si=4s8wBcvXrfDPEiRn\"\n", 28 | "url = \"https://www.youtube.com/watch?v=Hkgz1ysv9Fk\"\n", 29 | "# url = \"https://www.youtube.com/watch?v=f9_BWhCI4Zo\"\n", 30 | "# url = \"https://www.youtube.com/watch?v=8OJC21T2SL4\"\n", 31 | "loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)\n", 32 | "results = loader.load()\n", 33 | "\n", 34 | "llm = ChatOpenAI(model=\"gpt-3.5-turbo\", temperature=0, openai_api_key=openai_key)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 169, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "2308" 46 | ] 47 | }, 48 | "execution_count": 169, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "for document in results:\n", 55 | " text_content = document.page_content\n", 56 | "\n", 57 | "llm.get_num_tokens(text_content)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 170, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "Now we have 2 documents and the first one has 2169 tokens\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "from langchain import OpenAI\n", 75 | "from langchain.chains.summarize import 
load_summarize_chain\n", 76 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 77 | "\n", 78 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=500)\n", 79 | "\n", 80 | "docs = text_splitter.create_documents([text_content])\n", 81 | "\n", 82 | "num_docs = len(docs)\n", 83 | "\n", 84 | "num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)\n", 85 | "\n", 86 | "print (f\"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 171, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "from langchain import PromptTemplate\n", 96 | "\n", 97 | "map_prompt = \"\"\"\n", 98 | "Write a concise summary of the following:\n", 99 | "\"{text}\"\n", 100 | "CONCISE SUMMARY:\n", 101 | "\"\"\"\n", 102 | "map_prompt_template = PromptTemplate(template=map_prompt, input_variables=[\"text\"])\n", 103 | "\n", 104 | "summary_combine_prompt = \"\"\"\"\n", 105 | "Write detailed and comprehensive summary of the video transcript text.\n", 106 | "The summary should cover the main points and key details of the text.\n", 107 | "Return your response in bullet points.\n", 108 | "```{text}```\n", 109 | "BULLET POINT SUMMARY:\n", 110 | "\"\"\"\n", 111 | "summary_combine_prompt_template = PromptTemplate(template=summary_combine_prompt, input_variables=[\"text\"])" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 172, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "summary_chain = load_summarize_chain(llm=llm,\n", 121 | " chain_type='map_reduce',\n", 122 | " map_prompt=map_prompt_template,\n", 123 | " combine_prompt=summary_combine_prompt_template,\n", 124 | "# verbose=True\n", 125 | " )" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 173, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "output = summary_chain.run(docs)\n" 135 | ] 
136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "print (output)" 144 | ] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "personal_assistant", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.11.7" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 2 168 | } 169 | -------------------------------------------------------------------------------- /docs/jupyter_notebooks/document_rag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ImportError", 10 | "evalue": "cannot import name 'create_model' from 'langchain_core.runnables.utils' (/Users/janbanot/.pyenv/versions/3.11.7/envs/jupyter/lib/python3.11/site-packages/langchain_core/runnables/utils.py)", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 14 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 15 | "Cell \u001b[0;32mIn[18], line 9\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_openai\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m OpenAIEmbeddings\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_openai\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChatOpenAI\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m 
\u001b[38;5;28;01mimport\u001b[39;00m hub\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01moutput_parsers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m StrOutputParser\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrunnables\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RunnablePassthrough\n", 16 | "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/jupyter/lib/python3.11/site-packages/langchain/hub.py:10\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdump\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dumps\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m loads\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mprompts\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BasePromptTemplate\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m TYPE_CHECKING:\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchainhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Client\n", 17 | "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/jupyter/lib/python3.11/site-packages/langchain_core/prompts/__init__.py:27\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;124;03m\"\"\"**Prompt** is the input to the model.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \n\u001b[1;32m 3\u001b[0m \u001b[38;5;124;03mPrompt is 
often constructed\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 25\u001b[0m \n\u001b[1;32m 26\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m \u001b[38;5;66;03m# noqa: E501\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mprompts\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 28\u001b[0m BasePromptTemplate,\n\u001b[1;32m 29\u001b[0m aformat_document,\n\u001b[1;32m 30\u001b[0m format_document,\n\u001b[1;32m 31\u001b[0m )\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mprompts\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mchat\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 33\u001b[0m AIMessagePromptTemplate,\n\u001b[1;32m 34\u001b[0m BaseChatPromptTemplate,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 39\u001b[0m SystemMessagePromptTemplate,\n\u001b[1;32m 40\u001b[0m )\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mprompts\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfew_shot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 42\u001b[0m FewShotChatMessagePromptTemplate,\n\u001b[1;32m 43\u001b[0m FewShotPromptTemplate,\n\u001b[1;32m 44\u001b[0m )\n", 18 | "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/jupyter/lib/python3.11/site-packages/langchain_core/prompts/base.py:31\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrunnables\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RunnableConfig, RunnableSerializable\n\u001b[1;32m 30\u001b[0m 
\u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrunnables\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ensure_config\n\u001b[0;32m---> 31\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrunnables\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m create_model\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m TYPE_CHECKING:\n\u001b[1;32m 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocuments\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Document\n", 19 | "\u001b[0;31mImportError\u001b[0m: cannot import name 'create_model' from 'langchain_core.runnables.utils' (/Users/janbanot/.pyenv/versions/3.11.7/envs/jupyter/lib/python3.11/site-packages/langchain_core/runnables/utils.py)" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "import os\n", 25 | "from dotenv import load_dotenv\n", 26 | "import bs4\n", 27 | "from langchain_community.document_loaders import WebBaseLoader\n", 28 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", 29 | "from langchain_chroma import Chroma\n", 30 | "from langchain_openai import OpenAIEmbeddings\n", 31 | "from langchain_openai import ChatOpenAI\n", 32 | "from langchain import hub\n", 33 | "from langchain_core.output_parsers import StrOutputParser\n", 34 | "from langchain_core.runnables import RunnablePassthrough\n", 35 | "\n", 36 | "load_dotenv()\n", 37 | "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", 38 | "\n", 39 | "# Only keep post title, headers, and content from the full HTML.\n", 40 | "bs4_strainer = bs4.SoupStrainer(class_=(\"post-title\", \"post-header\", \"post-content\"))\n", 41 | 
"loader = WebBaseLoader(\n", 42 | " web_paths=(\"https://lilianweng.github.io/posts/2023-06-23-agent/\",),\n", 43 | " bs_kwargs={\"parse_only\": bs4_strainer},\n", 44 | ")\n", 45 | "docs = loader.load()\n", 46 | "\n", 47 | "text_splitter = RecursiveCharacterTextSplitter(\n", 48 | " chunk_size=1000, chunk_overlap=200, add_start_index=True\n", 49 | ")\n", 50 | "all_splits = text_splitter.split_documents(docs)\n", 51 | "\n", 52 | "vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())\n", 53 | "\n", 54 | "retriever = vectorstore.as_retriever(search_type=\"similarity\", search_kwargs={\"k\": 6})\n", 55 | "\n", 56 | "retrieved_docs = retriever.invoke(\"What are the approaches to Task Decomposition?\")\n", 57 | "\n", 58 | "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0125\")\n", 59 | "\n", 60 | "prompt = hub.pull(\"rlm/rag-prompt\")\n", 61 | "\n", 62 | "example_messages = prompt.invoke(\n", 63 | " {\"context\": \"filler context\", \"question\": \"filler question\"}\n", 64 | ").to_messages()\n", 65 | "\n", 66 | "def format_docs(docs):\n", 67 | " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", 68 | "\n", 69 | "\n", 70 | "rag_chain = (\n", 71 | " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", 72 | " | prompt\n", 73 | " | llm\n", 74 | " | StrOutputParser()\n", 75 | ")\n", 76 | "\n", 77 | "for chunk in rag_chain.stream(\"What is Task Decomposition?\"):\n", 78 | " print(chunk, end=\"\", flush=True)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "{'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/',\n", 90 | " 'start_index': 7056}" 91 | ] 92 | }, 93 | "execution_count": 10, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "all_splits[10].metadata" 100 | ] 101 | } 102 | ], 103 | "metadata": { 104 | "kernelspec": { 105 | 
"display_name": "personal_assistant", 106 | "language": "python", 107 | "name": "python3" 108 | }, 109 | "language_info": { 110 | "codemirror_mode": { 111 | "name": "ipython", 112 | "version": 3 113 | }, 114 | "file_extension": ".py", 115 | "mimetype": "text/x-python", 116 | "name": "python", 117 | "nbconvert_exporter": "python", 118 | "pygments_lexer": "ipython3", 119 | "version": "3.11.7" 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 2 124 | } 125 | --------------------------------------------------------------------------------