├── .env.example ├── .github └── workflows │ ├── cd.yaml │ └── ci.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── .vscode └── settings.json ├── Dockerfile ├── LICENSE ├── README.md ├── code_snippets ├── 03_custom_odm_example.py ├── 03_orm.py ├── 08_instructor_embeddings.py ├── 08_text_embeddings.py └── 08_text_image_embeddings.py ├── configs ├── digital_data_etl_maxime_labonne.yaml ├── digital_data_etl_paul_iusztin.yaml ├── end_to_end_data.yaml ├── evaluating.yaml ├── export_artifact_to_json.yaml ├── feature_engineering.yaml ├── generate_instruct_datasets.yaml ├── generate_preference_datasets.yaml └── training.yaml ├── data ├── artifacts │ ├── cleaned_documents.json │ ├── instruct_datasets.json │ ├── preference_datasets.json │ └── raw_documents.json └── data_warehouse_raw_data │ ├── ArticleDocument.json │ ├── PostDocument.json │ ├── RepositoryDocument.json │ └── UserDocument.json ├── docker-compose.yml ├── images ├── cover_plus.png └── crazy_cat.jpg ├── llm_engineering ├── __init__.py ├── application │ ├── __init__.py │ ├── crawlers │ │ ├── __init__.py │ │ ├── base.py │ │ ├── custom_article.py │ │ ├── dispatcher.py │ │ ├── github.py │ │ ├── linkedin.py │ │ └── medium.py │ ├── dataset │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── generation.py │ │ ├── output_parsers.py │ │ └── utils.py │ ├── networks │ │ ├── __init__.py │ │ ├── base.py │ │ └── embeddings.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── chunking_data_handlers.py │ │ ├── cleaning_data_handlers.py │ │ ├── dispatchers.py │ │ ├── embedding_data_handlers.py │ │ └── operations │ │ │ ├── __init__.py │ │ │ ├── chunking.py │ │ │ └── cleaning.py │ ├── rag │ │ ├── __init__.py │ │ ├── base.py │ │ ├── prompt_templates.py │ │ ├── query_expanison.py │ │ ├── reranking.py │ │ ├── retriever.py │ │ └── self_query.py │ └── utils │ │ ├── __init__.py │ │ ├── misc.py │ │ └── split_user_full_name.py ├── domain │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── nosql.py │ │ └── vector.py │ ├── chunks.py │ ├── cleaned_documents.py │ ├── dataset.py │ ├── documents.py │ ├── embedded_chunks.py │ ├── exceptions.py │ ├── inference.py │ ├── prompt.py │ ├── queries.py │ └── types.py ├── infrastructure │ ├── __init__.py │ ├── aws │ │ ├── __init__.py │ │ ├── deploy │ │ │ ├── __init__.py │ │ │ ├── autoscaling_sagemaker_endpoint.py │ │ │ ├── delete_sagemaker_endpoint.py │ │ │ └── huggingface │ │ │ │ ├── __init__.py │ │ │ │ ├── config.py │ │ │ │ ├── run.py │ │ │ │ └── sagemaker_huggingface.py │ │ └── roles │ │ │ ├── create_execution_role.py │ │ │ └── create_sagemaker_role.py │ ├── db │ │ ├── mongo.py │ │ └── qdrant.py │ ├── files_io.py │ ├── inference_pipeline_api.py │ └── opik_utils.py ├── model │ ├── Readme.md │ ├── __init__.py │ ├── evaluation │ │ ├── __init__.py │ │ ├── evaluate.py │ │ ├── requirements.txt │ │ └── sagemaker.py │ ├── finetuning │ │ ├── __init__.py │ │ ├── finetune.py │ │ ├── requirements.txt │ │ └── sagemaker.py │ ├── inference │ │ ├── __init__.py │ │ ├── inference.py │ │ ├── run.py │ │ └── test.py │ └── utils.py └── settings.py ├── pipelines ├── __init__.py ├── digital_data_etl.py ├── end_to_end_data.py ├── evaluating.py ├── export_artifact_to_json.py ├── feature_engineering.py ├── generate_datasets.py └── training.py ├── poetry.lock ├── pyproject.toml ├── ruff.toml ├── steps ├── __init__.py ├── etl │ ├── __init__.py │ ├── crawl_links.py │ └── get_or_create_user.py ├── evaluating │ ├── __init__.py │ └── evaluate.py ├── export │ ├── __init__.py │ ├── serialize_artifact.py │ └── to_json.py ├── 
feature_engineering │ ├── __init__.py │ ├── clean.py │ ├── load_to_vector_db.py │ ├── query_data_warehouse.py │ └── rag.py ├── generate_datasets │ ├── __init__.py │ ├── create_prompts.py │ ├── generate_intruction_dataset.py │ ├── generate_preference_dataset.py │ ├── push_to_huggingface.py │ └── query_feature_store.py └── training │ ├── __init__.py │ └── train.py ├── tests ├── __init__.py ├── integration │ ├── __init__.py │ └── integration_example_test.py └── unit │ ├── __init__.py │ └── unit_example_test.py └── tools ├── __init__.py ├── data_warehouse.py ├── ml_service.py ├── rag.py └── run.py /.env.example: -------------------------------------------------------------------------------- 1 | # --- Required settings even when working locally. --- 2 | 3 | # OpenAI API Config 4 | OPENAI_MODEL_ID=gpt-4o-mini 5 | OPENAI_API_KEY=str 6 | 7 | # Huggingface API Config 8 | HUGGINGFACE_ACCESS_TOKEN=str 9 | 10 | # Comet ML (during training and inference) 11 | COMET_API_KEY=str 12 | 13 | # --- Required settings when deploying the code. --- 14 | # --- Otherwise, default values work fine. --- 15 | 16 | # MongoDB database 17 | DATABASE_HOST="mongodb://llm_engineering:llm_engineering@127.0.0.1:27017" 18 | 19 | # Qdrant vector database 20 | USE_QDRANT_CLOUD=false 21 | QDRANT_CLOUD_URL=str 22 | QDRANT_APIKEY=str 23 | 24 | # AWS Authentication 25 | AWS_ARN_ROLE=str 26 | AWS_REGION=eu-central-1 27 | AWS_ACCESS_KEY=str 28 | AWS_SECRET_KEY=str 29 | -------------------------------------------------------------------------------- /.github/workflows/cd.yaml: -------------------------------------------------------------------------------- 1 | name: CD 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | build: 14 | name: Build & Push Docker Image 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout Code 18 | uses: actions/checkout@v3 19 | 20 | - name: Set up Docker Buildx 21 | uses: docker/setup-buildx-action@v3 22 | 23 | - name: Configure AWS credentials 24 | uses: aws-actions/configure-aws-credentials@v1 25 | with: 26 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 27 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 28 | aws-region: ${{ secrets.AWS_REGION }} 29 | 30 | - name: Login to Amazon ECR 31 | id: login-ecr 32 | uses: aws-actions/amazon-ecr-login@v1 33 | 34 | - name: Build images & push to ECR 35 | id: build-image 36 | uses: docker/build-push-action@v6 37 | with: 38 | context: . 
39 | file: ./Dockerfile 40 | tags: | 41 | ${{ steps.login-ecr.outputs.registry }}/${{ secrets.AWS_ECR_NAME }}:${{ github.sha }} 42 | ${{ steps.login-ecr.outputs.registry }}/${{ secrets.AWS_ECR_NAME }}:latest 43 | push: true 44 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.ref }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | qa: 12 | name: QA 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v3 18 | 19 | - name: Setup Python 20 | uses: actions/setup-python@v3 21 | with: 22 | python-version: "3.11" 23 | 24 | - name: Install poetry 25 | uses: abatilo/actions-poetry@v2 26 | with: 27 | poetry-version: 1.8.3 28 | 29 | - name: Install packages 30 | run: | 31 | poetry install --only dev 32 | poetry self add 'poethepoet[poetry_plugin]' 33 | 34 | - name: gitleaks check 35 | run: poetry poe gitleaks-check 36 | 37 | - name: Lint check [Python] 38 | run: poetry poe lint-check 39 | 40 | - name: Format check [Python] 41 | run: poetry poe format-check 42 | 43 | test: 44 | name: Test 45 | runs-on: ubuntu-latest 46 | 47 | steps: 48 | - name: Checkout 49 | uses: actions/checkout@v3 50 | 51 | - name: Setup Python 52 | uses: actions/setup-python@v3 53 | with: 54 | python-version: "3.11" 55 | 56 | - name: Install poetry 57 | uses: abatilo/actions-poetry@v2 58 | with: 59 | poetry-version: 1.8.3 60 | 61 | - name: Install packages 62 | run: | 63 | poetry install 64 | poetry self add 'poethepoet[poetry_plugin]' 65 | 66 | - name: Run tests 67 | run: | 68 | echo "Running tests..." 69 | poetry poe test 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # IDEs 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | # MacOs 165 | .DS_Store 166 | 167 | # VS Code 168 | .vscode/**/launch.json 169 | 170 | # Data 171 | output/ 172 | sagemaker_*.json 173 | run_ids.txt 174 | 175 | # Virtual environments 176 | *_venv 177 | *_myenv 178 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.3.5 4 | hooks: 5 | - id: ruff # Run the linter. 6 | - id: ruff-format # Run the formatter. 7 | - repo: https://github.com/gitleaks/gitleaks 8 | rev: v8.18.2 9 | hooks: 10 | - id: gitleaks 11 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11.8 2 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.formatOnSave": true, 4 | "editor.codeActionsOnSave": { 5 | "source.fixAll": "explicit", 6 | "source.organizeImports": "explicit" 7 | }, 8 | "editor.defaultFormatter": "charliermarsh.ruff" 9 | }, 10 | "notebook.formatOnSave.enabled": true, 11 | "notebook.codeActionsOnSave": { 12 | "notebook.source.fixAll": "explicit", 13 | "notebook.source.organizeImports": "explicit" 14 | }, 15 | } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim-bullseye AS release 2 | 3 | ENV WORKSPACE_ROOT=/app/ 4 | ENV PYTHONDONTWRITEBYTECODE=1 5 | ENV PYTHONUNBUFFERED=1 6 | ENV POETRY_VERSION=1.8.3 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | ENV POETRY_NO_INTERACTION=1 9 | 10 | # Install Google Chrome 11 | RUN apt-get update -y && \ 12 | apt-get install -y gnupg wget curl --no-install-recommends && \ 13 | wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-linux-signing-key.gpg && \ 14 | echo "deb [signed-by=/usr/share/keyrings/google-linux-signing-key.gpg] https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \ 15 | apt-get update -y && \ 16 | apt-get install -y google-chrome-stable && \ 17 | rm -rf /var/lib/apt/lists/* 18 | 19 | # Install other system dependencies. 20 | RUN apt-get update -y \ 21 | && apt-get install -y --no-install-recommends build-essential \ 22 | gcc \ 23 | python3-dev \ 24 | build-essential \ 25 | libglib2.0-dev \ 26 | libnss3-dev \ 27 | && apt-get clean \ 28 | && rm -rf /var/lib/apt/lists/* 29 | 30 | # Install Poetry using pip and clear cache 31 | RUN pip install --no-cache-dir "poetry==$POETRY_VERSION" 32 | RUN poetry config installer.max-workers 20 33 | 34 | WORKDIR $WORKSPACE_ROOT 35 | 36 | # Copy the poetry lock file and pyproject.toml file to install dependencies 37 | COPY pyproject.toml poetry.lock $WORKSPACE_ROOT 38 | 39 | # Install the dependencies and clear cache 40 | RUN poetry config virtualenvs.create false && \ 41 | poetry install --no-root --no-interaction --no-cache --without dev && \ 42 | poetry self add 'poethepoet[poetry_plugin]' && \ 43 | rm -rf ~/.cache/pypoetry/cache/ && \ 44 | rm -rf ~/.cache/pypoetry/artifacts/ 45 | 46 | # Copy the rest of the code. 47 | COPY . 
$WORKSPACE_ROOT 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /code_snippets/03_custom_odm_example.py: -------------------------------------------------------------------------------- 1 | from llm_engineering.domain.documents import ArticleDocument, UserDocument 2 | 3 | if __name__ == "__main__": 4 | user = UserDocument.get_or_create(first_name="Paul", last_name="Iusztin") 5 | articles = ArticleDocument.bulk_find(author_id=str(user.id)) 6 | 7 | print(f"User ID: {user.id}") # noqa 8 | print(f"User name: {user.first_name} {user.last_name}") # noqa 9 | print(f"Number of articles: {len(articles)}") # noqa 10 | print("First article link:", articles[0].link) # noqa 11 | -------------------------------------------------------------------------------- /code_snippets/03_orm.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, create_engine 2 | from sqlalchemy.orm import declarative_base, sessionmaker 3 | 4 | # Create virtual environment, install dependencies and run the code: 5 | # 1. Create: python3 -m venv orm_venv 6 | # 2. Activate: source orm_venv/bin/activate 7 | # 3. Install: pip install sqlalchemy==2.0.35 8 | # 4. Run the code: python code_snippets/03_orm.py 9 | 10 | if __name__ == "__main__": 11 | Base = declarative_base() 12 | 13 | # Define a class that maps to the users table. 14 | class User(Base): 15 | __tablename__ = "users" 16 | 17 | id = Column(Integer, primary_key=True) 18 | name = Column(String) 19 | 20 | # Create an SQLite database in memory. 21 | engine = create_engine("sqlite:///:memory:") 22 | Base.metadata.create_all(engine) 23 | 24 | # Create a session used to interact with the database. 25 | Session = sessionmaker(bind=engine) 26 | session = Session() 27 | 28 | # Add a new user. 29 | new_user = User(name="Alice") 30 | session.add(new_user) 31 | session.commit() 32 | 33 | # Query the database. 
34 | user = session.query(User).first() 35 | if user: 36 | print(f"User ID: {user.id}") # noqa 37 | print(f"User name: {user.name}") # noqa 38 | -------------------------------------------------------------------------------- /code_snippets/08_instructor_embeddings.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | 3 | # Create virtual environment, install dependencies and run the code: 4 | # 1. Create: python3 -m venv instructor_venv 5 | # 2. Activate: source instructor_venv/bin/activate 6 | # 3. Install: pip install sentence-transformers==3.3.0 7 | # 4. Run the code: python code_snippets/08_instructor_embeddings.py 8 | 9 | if __name__ == "__main__": 10 | model = SentenceTransformer("hkunlp/instructor-base") 11 | 12 | sentence = "RAG Fundamentals First" 13 | 14 | instruction = "Represent the title of an article about AI:" 15 | 16 | embeddings = model.encode([[instruction, sentence]]) 17 | print(embeddings.shape) # noqa 18 | # Output: (1, 768) 19 | -------------------------------------------------------------------------------- /code_snippets/08_text_embeddings.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | 3 | # Leverage the Poetry virtual environment to run the code: 4 | # poetry run python code_snippets/08_text_embeddings.py 5 | 6 | if __name__ == "__main__": 7 | # 1. Load a pretrained Sentence Transformer model. 8 | model = SentenceTransformer("all-MiniLM-L6-v2") 9 | 10 | # The sentences to encode. 11 | sentences = ["The dog sits outside waiting for a treat.", "I am going swimming.", "The dog is swimming."] 12 | 13 | # 2. Calculate embeddings. 14 | embeddings = model.encode(sentences) 15 | print(embeddings.shape) # noqa 16 | # Output: (3, 384) 17 | 18 | # 3. Calculate the embedding similarities using cosine similarity. 19 | similarities = model.similarity(embeddings, embeddings) 20 | print(similarities) # noqa 21 | # Output: 22 | # tensor([[ 1.0000, -0.0389, 0.2692], 23 | # [-0.0389, 1.0000, 0.3837], 24 | # [ 0.2692, 0.3837, 1.0000]]) 25 | # 26 | # similarities[0, 0] = The similarity between the first sentence and itself. 27 | # similarities[0, 1] = The similarity between the first and second sentence. 28 | # similarities[2, 1] = The similarity between the third and second sentence. 29 | -------------------------------------------------------------------------------- /code_snippets/08_text_image_embeddings.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | import requests 4 | from PIL import Image 5 | from sentence_transformers import SentenceTransformer 6 | 7 | # Leverage the Poetry virtual environment to run the code: 8 | # poetry run python code_snippets/08_text_image_embeddings.py 9 | 10 | if __name__ == "__main__": 11 | # Load an image with a crazy cat. 12 | response = requests.get( 13 | "https://github.com/PacktPublishing/LLM-Engineering/blob/main/images/crazy_cat.jpg?raw=true" 14 | ) 15 | image = Image.open(BytesIO(response.content)) 16 | 17 | # Load CLIP model. 18 | model = SentenceTransformer("clip-ViT-B-32") 19 | 20 | # Encode the loaded image. 21 | img_emb = model.encode(image) 22 | 23 | # Encode text descriptions. 
24 | text_emb = model.encode( 25 | [ 26 | "A crazy cat smiling.", 27 | "A white and brown cat with a yellow bandana.", 28 | "A man eating in the garden.", 29 | ] 30 | ) 31 | print(text_emb.shape) # noqa 32 | # Output: (3, 512) 33 | 34 | # Compute similarities. 35 | similarity_scores = model.similarity(img_emb, text_emb) 36 | print(similarity_scores) # noqa 37 | # Output: tensor([[0.3068, 0.3300, 0.1719]]) 38 | -------------------------------------------------------------------------------- /configs/digital_data_etl_maxime_labonne.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | user_full_name: Maxime Labonne # [First Name(s)] [Last Name] 10 | links: 11 | # Personal Blog 12 | - https://mlabonne.github.io/blog/posts/2024-07-29_Finetune_Llama31.html 13 | - https://mlabonne.github.io/blog/posts/2024-07-15_The_Rise_of_Agentic_Data_Generation.html 14 | # Substack 15 | - https://maximelabonne.substack.com/p/uncensor-any-llm-with-abliteration-d30148b7d43e 16 | - https://maximelabonne.substack.com/p/create-mixtures-of-experts-with-mergekit-11b318c99562 17 | - https://maximelabonne.substack.com/p/merge-large-language-models-with-mergekit-2118fb392b54 18 | - https://maximelabonne.substack.com/p/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac 19 | - https://maximelabonne.substack.com/p/exllamav2-the-fastest-library-to-run-llms-32aeda294d26 20 | - https://maximelabonne.substack.com/p/quantize-llama-models-with-ggml-and-llama-cpp-3612dfbcc172 21 | - https://maximelabonne.substack.com/p/a-beginners-guide-to-llm-fine-tuning-4bae7d4da672 22 | - https://maximelabonne.substack.com/p/graph-convolutional-networks-introduction-to-gnns-24b3f60d6c95 23 | - https://maximelabonne.substack.com/p/4-bit-quantization-with-gptq-36b0f4f02c34 24 | - https://maximelabonne.substack.com/p/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32 25 | - https://maximelabonne.substack.com/p/introduction-to-weight-quantization-2494701b9c0c 26 | - https://maximelabonne.substack.com/p/decoding-strategies-in-large-language-models-9733a8f70539 27 | - https://maximelabonne.substack.com/p/the-art-of-spending-optimizing-your-marketing-budget-with-nonlinear-optimization-6c8a39afb3c2 28 | - https://maximelabonne.substack.com/p/create-a-bot-to-find-diamonds-in-minecraft-d836606a993a 29 | - https://maximelabonne.substack.com/p/constraint-programming-67ac16fa0c81 30 | - https://maximelabonne.substack.com/p/how-to-design-the-most-powerful-graph-neural-network-3d18b07a6e66 31 | - https://maximelabonne.substack.com/p/introduction-to-graphsage-in-python-a9e7f9ecf9d7 32 | - https://maximelabonne.substack.com/p/graph-attention-networks-in-python-975736ac5c0c 33 | - https://maximelabonne.substack.com/p/integer-programming-vs-linear-programming-in-python-f1be5bb4e60e 34 | - https://maximelabonne.substack.com/p/introduction-to-linear-programming-in-python-9261e7eb44b 35 | - https://maximelabonne.substack.com/p/what-is-a-tensor-in-deep-learning-6dedd95d6507 36 | - https://maximelabonne.substack.com/p/efficiently-iterating-over-rows-in-a-pandas-dataframe-7dd5f9992c01 37 | - https://maximelabonne.substack.com/p/q-learning-for-beginners-2837b777741 38 | - https://maximelabonne.substack.com/p/how-to-start-machine-learning-for-developers-in-2022-390af12b193f 39 | 
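Editor's sketch: the digital data ETL configs (the Maxime Labonne file above and the Paul Iusztin file that follows) share one schema — a `settings` block consumed by ZenML and a `parameters` block with one author name plus the links to crawl. The snippet below is illustrative only, not a file from this repository; it assumes PyYAML and Pydantic are available and simply shows how that `parameters` block can be loaded and validated. In the actual project, the whole file is handed to the ZenML runner, which maps `parameters` onto the pipeline's function arguments.

from pathlib import Path

import yaml  # assumes PyYAML is installed
from pydantic import BaseModel


class ETLParameters(BaseModel):
    # Mirrors the `parameters` block of configs/digital_data_etl_*.yaml.
    user_full_name: str  # "[First Name(s)] [Last Name]"
    links: list[str]


def load_etl_parameters(config_path: str) -> ETLParameters:
    # Read the YAML file and validate only the `parameters` section.
    raw = yaml.safe_load(Path(config_path).read_text())
    return ETLParameters(**raw["parameters"])


if __name__ == "__main__":
    params = load_etl_parameters("configs/digital_data_etl_maxime_labonne.yaml")
    print(f"{params.user_full_name}: {len(params.links)} links to crawl")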
-------------------------------------------------------------------------------- /configs/digital_data_etl_paul_iusztin.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | user_full_name: Paul Iusztin # [First Name(s)] [Last Name] 10 | links: 11 | # Medium (only articles that are not under the paid wall work) 12 | - https://medium.com/decodingml/an-end-to-end-framework-for-production-ready-llm-systems-by-building-your-llm-twin-2cc6bb01141f 13 | - https://medium.com/decodingml/a-real-time-retrieval-system-for-rag-on-social-media-data-9cc01d50a2a0 14 | - https://medium.com/decodingml/sota-python-streaming-pipelines-for-fine-tuning-llms-and-rag-in-real-time-82eb07795b87 15 | - https://medium.com/decodingml/the-4-advanced-rag-algorithms-you-must-know-to-implement-5d0c7f1199d2 16 | - https://medium.com/decodingml/architect-scalable-and-cost-effective-llm-rag-inference-pipelines-73b94ef82a99 17 | # Substack 18 | - https://decodingml.substack.com/p/real-time-feature-pipelines-with?r=1ttoeh 19 | - https://decodingml.substack.com/p/building-ml-systems-the-right-way?r=1ttoeh 20 | - https://decodingml.substack.com/p/reduce-your-pytorchs-code-latency?r=1ttoeh 21 | - https://decodingml.substack.com/p/llm-agents-demystified?r=1ttoeh 22 | - https://decodingml.substack.com/p/scalable-rag-ingestion-pipeline-using?r=1ttoeh 23 | - https://decodingml.substack.com/p/the-ultimate-mlops-tool?r=1ttoeh 24 | - https://decodingml.substack.com/p/the-new-king-of-infrastructure-as?r=1ttoeh 25 | - https://decodingml.substack.com/p/highly-scalable-data-ingestion-architecture?r=1ttoeh 26 | - https://decodingml.substack.com/p/2-key-llmops-concepts?r=1ttoeh 27 | - https://decodingml.substack.com/p/the-llm-twin-free-course-on-production?r=1ttoeh 28 | - https://decodingml.substack.com/p/a-blueprint-for-designing-production?r=1ttoeh 29 | - https://decodingml.substack.com/p/the-difference-between-development?r=1ttoeh 30 | - https://decodingml.substack.com/p/architect-scalable-and-cost-effective?r=1ttoeh 31 | - https://decodingml.substack.com/p/7-tips-to-reduce-your-vram-when-training?r=1ttoeh 32 | - https://decodingml.substack.com/p/using-this-python-package-you-can?r=1ttoeh 33 | - https://decodingml.substack.com/p/the-4-advanced-rag-algorithms-you?r=1ttoeh 34 | - https://decodingml.substack.com/p/problems-deploying-your-ml-models?r=1ttoeh 35 | - https://decodingml.substack.com/p/sota-python-streaming-pipelines-for?r=1ttoeh 36 | - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh 37 | - https://decodingml.substack.com/p/my-ml-monthly-learning-resource-recommendations?r=1ttoeh 38 | - https://decodingml.substack.com/p/an-end-to-end-framework-for-production?r=1ttoeh 39 | - https://decodingml.substack.com/p/upskill-your-llm-knowledge-base-with?r=1ttoeh 40 | - https://decodingml.substack.com/p/want-to-learn-an-end-to-end-framework?r=1ttoeh 41 | - https://decodingml.substack.com/p/my-favorite-way-to-implement-a-configuration?r=1ttoeh 42 | - https://decodingml.substack.com/p/a-real-time-retrieval-system-for?r=1ttoeh 43 | - https://decodingml.substack.com/p/4-key-decoding-strategies-for-llms?r=1ttoeh 44 | - https://decodingml.substack.com/p/dml-new-year-the-new-and-improved?r=1ttoeh 45 | - https://decodingml.substack.com/p/dml-8-types-of-mlops-tools-that-must?r=1ttoeh 46 
| - https://decodingml.substack.com/p/dml-this-is-what-you-need-to-build?r=1ttoeh 47 | - https://decodingml.substack.com/p/dml-7-steps-on-how-to-fine-tune-an?r=1ttoeh 48 | - https://decodingml.substack.com/p/dml-how-do-you-generate-a-q-and-a?r=1ttoeh 49 | - https://decodingml.substack.com/p/dml-what-do-you-need-to-fine-tune?r=1ttoeh 50 | - https://decodingml.substack.com/p/dml-why-and-when-do-you-need-to-fine?r=1ttoeh 51 | - https://decodingml.substack.com/p/dml-how-to-implement-a-streaming?r=1ttoeh 52 | - https://decodingml.substack.com/p/dml-why-and-what-do-you-need-a-streaming?r=1ttoeh 53 | - https://decodingml.substack.com/p/dml-unwrapping-the-3-pipeline-design?r=1ttoeh 54 | - https://decodingml.substack.com/p/dml-how-to-design-an-llm-system-for?r=1ttoeh 55 | - https://decodingml.substack.com/p/dml-synced-vector-dbs-a-guide-to?r=1ttoeh 56 | - https://decodingml.substack.com/p/dml-what-is-the-difference-between?r=1ttoeh 57 | - https://decodingml.substack.com/p/dml-7-steps-to-build-a-production?r=1ttoeh 58 | - https://decodingml.substack.com/p/dml-chain-of-thought-reasoning-write?r=1ttoeh 59 | - https://decodingml.substack.com/p/dml-build-and-serve-a-production?r=1ttoeh 60 | - https://decodingml.substack.com/p/dml-4-key-ideas-you-must-know-to?r=1ttoeh 61 | - https://decodingml.substack.com/p/dml-how-to-add-real-time-monitoring?r=1ttoeh 62 | - https://decodingml.substack.com/p/dml-top-6-ml-platform-features-you?r=1ttoeh -------------------------------------------------------------------------------- /configs/end_to_end_data.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | # Data ETL & Feature engineering pipelines parameters 10 | author_links: 11 | - user_full_name: Paul Iusztin # [First Name(s)] [Last Name] 12 | links: 13 | # Medium (only articles that are not under the paid wall work) 14 | - https://medium.com/decodingml/an-end-to-end-framework-for-production-ready-llm-systems-by-building-your-llm-twin-2cc6bb01141f 15 | - https://medium.com/decodingml/a-real-time-retrieval-system-for-rag-on-social-media-data-9cc01d50a2a0 16 | - https://medium.com/decodingml/sota-python-streaming-pipelines-for-fine-tuning-llms-and-rag-in-real-time-82eb07795b87 17 | - https://medium.com/decodingml/the-4-advanced-rag-algorithms-you-must-know-to-implement-5d0c7f1199d2 18 | - https://medium.com/decodingml/architect-scalable-and-cost-effective-llm-rag-inference-pipelines-73b94ef82a99 19 | # Substack 20 | - https://decodingml.substack.com/p/a-blueprint-for-designing-production?r=1ttoeh 21 | - https://decodingml.substack.com/p/the-difference-between-development?r=1ttoeh 22 | - https://decodingml.substack.com/p/architect-scalable-and-cost-effective?r=1ttoeh 23 | - https://decodingml.substack.com/p/7-tips-to-reduce-your-vram-when-training?r=1ttoeh 24 | - https://decodingml.substack.com/p/using-this-python-package-you-can?r=1ttoeh 25 | - https://decodingml.substack.com/p/the-4-advanced-rag-algorithms-you?r=1ttoeh 26 | - https://decodingml.substack.com/p/problems-deploying-your-ml-models?r=1ttoeh 27 | - https://decodingml.substack.com/p/sota-python-streaming-pipelines-for?r=1ttoeh 28 | - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh 29 | - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh 30 | - 
https://decodingml.substack.com/p/my-ml-monthly-learning-resource-recommendations?r=1ttoeh 31 | - https://decodingml.substack.com/p/an-end-to-end-framework-for-production?r=1ttoeh 32 | - https://decodingml.substack.com/p/upskill-your-llm-knowledge-base-with?r=1ttoeh 33 | - https://decodingml.substack.com/p/want-to-learn-an-end-to-end-framework?r=1ttoeh 34 | - https://decodingml.substack.com/p/my-favorite-way-to-implement-a-configuration?r=1ttoeh 35 | - https://decodingml.substack.com/p/a-real-time-retrieval-system-for?r=1ttoeh 36 | - https://decodingml.substack.com/p/4-key-decoding-strategies-for-llms?r=1ttoeh 37 | - https://decodingml.substack.com/p/dml-new-year-the-new-and-improved?r=1ttoeh 38 | - https://decodingml.substack.com/p/dml-8-types-of-mlops-tools-that-must?r=1ttoeh 39 | - https://decodingml.substack.com/p/dml-this-is-what-you-need-to-build?r=1ttoeh 40 | - https://decodingml.substack.com/p/dml-7-steps-on-how-to-fine-tune-an?r=1ttoeh 41 | - https://decodingml.substack.com/p/dml-how-do-you-generate-a-q-and-a?r=1ttoeh 42 | - https://decodingml.substack.com/p/dml-what-do-you-need-to-fine-tune?r=1ttoeh 43 | - https://decodingml.substack.com/p/dml-why-and-when-do-you-need-to-fine?r=1ttoeh 44 | - https://decodingml.substack.com/p/dml-how-to-implement-a-streaming?r=1ttoeh 45 | - https://decodingml.substack.com/p/dml-why-and-what-do-you-need-a-streaming?r=1ttoeh 46 | - https://decodingml.substack.com/p/dml-unwrapping-the-3-pipeline-design?r=1ttoeh 47 | - https://decodingml.substack.com/p/dml-how-to-design-an-llm-system-for?r=1ttoeh 48 | - https://decodingml.substack.com/p/dml-synced-vector-dbs-a-guide-to?r=1ttoeh 49 | - https://decodingml.substack.com/p/dml-what-is-the-difference-between?r=1ttoeh 50 | - https://decodingml.substack.com/p/dml-7-steps-to-build-a-production?r=1ttoeh 51 | - https://decodingml.substack.com/p/dml-chain-of-thought-reasoning-write?r=1ttoeh 52 | - https://decodingml.substack.com/p/dml-build-and-serve-a-production?r=1ttoeh 53 | - https://decodingml.substack.com/p/dml-4-key-ideas-you-must-know-to?r=1ttoeh 54 | - https://decodingml.substack.com/p/dml-how-to-add-real-time-monitoring?r=1ttoeh 55 | - https://decodingml.substack.com/p/dml-top-6-ml-platform-features-you?r=1ttoeh 56 | - user_full_name: Maxime Labonne # [First Name(s)] [Last Name] 57 | links: 58 | # Substack 59 | - https://maximelabonne.substack.com/p/uncensor-any-llm-with-abliteration-d30148b7d43e 60 | - https://maximelabonne.substack.com/p/create-mixtures-of-experts-with-mergekit-11b318c99562 61 | - https://maximelabonne.substack.com/p/merge-large-language-models-with-mergekit-2118fb392b54 62 | - https://maximelabonne.substack.com/p/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac 63 | - https://maximelabonne.substack.com/p/exllamav2-the-fastest-library-to-run-llms-32aeda294d26 64 | - https://maximelabonne.substack.com/p/quantize-llama-models-with-ggml-and-llama-cpp-3612dfbcc172 65 | - https://maximelabonne.substack.com/p/a-beginners-guide-to-llm-fine-tuning-4bae7d4da672 66 | - https://maximelabonne.substack.com/p/graph-convolutional-networks-introduction-to-gnns-24b3f60d6c95 67 | - https://maximelabonne.substack.com/p/4-bit-quantization-with-gptq-36b0f4f02c34 68 | - https://maximelabonne.substack.com/p/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32 69 | - https://maximelabonne.substack.com/p/introduction-to-weight-quantization-2494701b9c0c 70 | - https://maximelabonne.substack.com/p/decoding-strategies-in-large-language-models-9733a8f70539 71 | - 
https://maximelabonne.substack.com/p/the-art-of-spending-optimizing-your-marketing-budget-with-nonlinear-optimization-6c8a39afb3c2 72 | - https://maximelabonne.substack.com/p/create-a-bot-to-find-diamonds-in-minecraft-d836606a993a 73 | - https://maximelabonne.substack.com/p/constraint-programming-67ac16fa0c81 74 | - https://maximelabonne.substack.com/p/how-to-design-the-most-powerful-graph-neural-network-3d18b07a6e66 75 | - https://maximelabonne.substack.com/p/introduction-to-graphsage-in-python-a9e7f9ecf9d7 76 | - https://maximelabonne.substack.com/p/graph-attention-networks-in-python-975736ac5c0c 77 | - https://maximelabonne.substack.com/p/integer-programming-vs-linear-programming-in-python-f1be5bb4e60e 78 | - https://maximelabonne.substack.com/p/introduction-to-linear-programming-in-python-9261e7eb44b 79 | - https://maximelabonne.substack.com/p/what-is-a-tensor-in-deep-learning-6dedd95d6507 80 | - https://maximelabonne.substack.com/p/efficiently-iterating-over-rows-in-a-pandas-dataframe-7dd5f9992c01 81 | - https://maximelabonne.substack.com/p/q-learning-for-beginners-2837b777741 82 | - https://maximelabonne.substack.com/p/how-to-start-machine-learning-for-developers-in-2022-390af12b193f 83 | # Generate instruct dataset pipeline parameters 84 | test_split_size: 0.1 85 | push_to_huggingface: false 86 | dataset_id: pauliusztin/llmtwin 87 | mock: false -------------------------------------------------------------------------------- /configs/evaluating.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | is_dummy: true # Change this to 'false' to run the evaluation on the full dataset. 
10 | -------------------------------------------------------------------------------- /configs/export_artifact_to_json.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | artifact_names: 10 | - raw_documents 11 | - cleaned_documents 12 | - instruct_datasets 13 | - preference_datasets 14 | -------------------------------------------------------------------------------- /configs/feature_engineering.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | author_full_names: 10 | - Maxime Labonne 11 | - Paul Iusztin 12 | -------------------------------------------------------------------------------- /configs/generate_instruct_datasets.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | test_split_size: 0.1 10 | dataset_type: "instruction" 11 | push_to_huggingface: true 12 | dataset_id: pauliusztin/llmtwin 13 | mock: false 14 | -------------------------------------------------------------------------------- /configs/generate_preference_datasets.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | test_split_size: 0.05 10 | dataset_type: "preference" 11 | push_to_huggingface: true 12 | dataset_id: pauliusztin/llmtwin-dpo 13 | mock: false 14 | -------------------------------------------------------------------------------- /configs/training.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | finetuning_type: sft 10 | num_train_epochs: 3 11 | per_device_train_batch_size: 2 12 | learning_rate: 3e-4 13 | dataset_huggingface_workspace: mlabonne 14 | is_dummy: true # Change this to 'false' to run the training with the full dataset and epochs. 
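Editor's sketch: the hyperparameters in configs/training.yaml above (three epochs, per-device batch size of 2, learning rate of 3e-4) are forwarded to the SFT fine-tuning job, while `is_dummy: true` shrinks the dataset and epochs for a cheap smoke-test run, as the inline comment notes. The snippet below is illustrative only — not the repository's finetune.py, whose exact argument names are not shown in this listing — and assumes the transformers library; it shows how these values would typically map onto Hugging Face TrainingArguments.

from transformers import TrainingArguments

# Values copied from configs/training.yaml; output_dir is a hypothetical path.
args = TrainingArguments(
    output_dir="output/sft",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=3e-4,
)
print(args.num_train_epochs, args.per_device_train_batch_size, args.learning_rate)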
15 | -------------------------------------------------------------------------------- /data/data_warehouse_raw_data/PostDocument.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /data/data_warehouse_raw_data/RepositoryDocument.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /data/data_warehouse_raw_data/UserDocument.json: -------------------------------------------------------------------------------- 1 | [{"first_name": "Maxime", "last_name": "Labonne", "_id": "eff74089-0271-4319-8543-745c087f4f61"}, {"first_name": "Paul", "last_name": "Iusztin", "_id": "b5fa1f08-75f0-402d-8e88-d1357e346d9e"}] -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | mongo: 3 | image: mongo:latest 4 | container_name: "llm_engineering_mongo" 5 | logging: 6 | options: 7 | max-size: 1g 8 | environment: 9 | MONGO_INITDB_ROOT_USERNAME: "llm_engineering" 10 | MONGO_INITDB_ROOT_PASSWORD: "llm_engineering" 11 | ports: 12 | - 27017:27017 13 | volumes: 14 | - mongo_data:/data/db 15 | networks: 16 | - local 17 | restart: always 18 | 19 | qdrant: 20 | image: qdrant/qdrant:latest 21 | container_name: "llm_engineering_qdrant" 22 | ports: 23 | - 6333:6333 24 | - 6334:6334 25 | expose: 26 | - 6333 27 | - 6334 28 | volumes: 29 | - qdrant_data:/qdrant/storage 30 | networks: 31 | - local 32 | restart: always 33 | 34 | volumes: 35 | mongo_data: 36 | qdrant_data: 37 | 38 | networks: 39 | local: 40 | driver: bridge -------------------------------------------------------------------------------- /images/cover_plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/images/cover_plus.png -------------------------------------------------------------------------------- /images/crazy_cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/images/crazy_cat.jpg -------------------------------------------------------------------------------- /llm_engineering/__init__.py: -------------------------------------------------------------------------------- 1 | from llm_engineering import application, domain, infrastructure 2 | from llm_engineering.settings import settings 3 | 4 | __all__ = ["settings", "application", "domain", "infrastructure"] 5 | -------------------------------------------------------------------------------- /llm_engineering/application/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import utils 2 | 3 | __all__ = ["utils"] 4 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/__init__.py: -------------------------------------------------------------------------------- 1 | from .dispatcher import CrawlerDispatcher 2 | from .github import GithubCrawler 3 | from .linkedin import LinkedInCrawler 4 | from .medium import MediumCrawler 5 | 6 | __all__ = ["CrawlerDispatcher", "GithubCrawler", "LinkedInCrawler", "MediumCrawler"] 7 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/base.py: -------------------------------------------------------------------------------- 1 | import time 2 | from abc import ABC, abstractmethod 3 | from tempfile import mkdtemp 4 | 5 | import chromedriver_autoinstaller 6 | from selenium import webdriver 7 | from selenium.webdriver.chrome.options import Options 8 | 9 | from llm_engineering.domain.documents import NoSQLBaseDocument 10 | 11 | # Check if the current version of chromedriver exists 12 | # and if it doesn't exist, download it automatically, 13 | # then add chromedriver to path 14 | chromedriver_autoinstaller.install() 15 | 16 | 17 | class BaseCrawler(ABC): 18 | model: type[NoSQLBaseDocument] 19 | 20 | @abstractmethod 21 | def extract(self, link: str, **kwargs) -> None: ... 22 | 23 | 24 | class BaseSeleniumCrawler(BaseCrawler, ABC): 25 | def __init__(self, scroll_limit: int = 5) -> None: 26 | options = webdriver.ChromeOptions() 27 | 28 | options.add_argument("--no-sandbox") 29 | options.add_argument("--headless=new") 30 | options.add_argument("--disable-dev-shm-usage") 31 | options.add_argument("--log-level=3") 32 | options.add_argument("--disable-popup-blocking") 33 | options.add_argument("--disable-notifications") 34 | options.add_argument("--disable-extensions") 35 | options.add_argument("--disable-background-networking") 36 | options.add_argument("--ignore-certificate-errors") 37 | options.add_argument(f"--user-data-dir={mkdtemp()}") 38 | options.add_argument(f"--data-path={mkdtemp()}") 39 | options.add_argument(f"--disk-cache-dir={mkdtemp()}") 40 | options.add_argument("--remote-debugging-port=9226") 41 | 42 | self.set_extra_driver_options(options) 43 | 44 | self.scroll_limit = scroll_limit 45 | self.driver = webdriver.Chrome( 46 | options=options, 47 | ) 48 | 49 | def set_extra_driver_options(self, options: Options) -> None: 50 | pass 51 | 52 | def login(self) -> None: 53 | pass 54 | 55 | def scroll_page(self) -> None: 56 | """Scroll through the LinkedIn page based on the scroll limit.""" 57 | current_scroll = 0 58 | last_height = self.driver.execute_script("return document.body.scrollHeight") 59 | while True: 60 | self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 61 | time.sleep(5) 62 | new_height = self.driver.execute_script("return document.body.scrollHeight") 63 | if new_height == last_height or (self.scroll_limit and current_scroll >= self.scroll_limit): 64 | break 65 | last_height = new_height 66 | current_scroll += 1 67 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/custom_article.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from langchain_community.document_loaders import AsyncHtmlLoader 4 | from langchain_community.document_transformers.html2text import Html2TextTransformer 5 | from loguru import logger 
6 | 7 | from llm_engineering.domain.documents import ArticleDocument 8 | 9 | from .base import BaseCrawler 10 | 11 | 12 | class CustomArticleCrawler(BaseCrawler): 13 | model = ArticleDocument 14 | 15 | def __init__(self) -> None: 16 | super().__init__() 17 | 18 | def extract(self, link: str, **kwargs) -> None: 19 | old_model = self.model.find(link=link) 20 | if old_model is not None: 21 | logger.info(f"Article already exists in the database: {link}") 22 | 23 | return 24 | 25 | logger.info(f"Starting scraping article: {link}") 26 | 27 | loader = AsyncHtmlLoader([link]) 28 | docs = loader.load() 29 | 30 | html2text = Html2TextTransformer() 31 | docs_transformed = html2text.transform_documents(docs) 32 | doc_transformed = docs_transformed[0] 33 | 34 | content = { 35 | "Title": doc_transformed.metadata.get("title"), 36 | "Subtitle": doc_transformed.metadata.get("description"), 37 | "Content": doc_transformed.page_content, 38 | "language": doc_transformed.metadata.get("language"), 39 | } 40 | 41 | parsed_url = urlparse(link) 42 | platform = parsed_url.netloc 43 | 44 | user = kwargs["user"] 45 | instance = self.model( 46 | content=content, 47 | link=link, 48 | platform=platform, 49 | author_id=user.id, 50 | author_full_name=user.full_name, 51 | ) 52 | instance.save() 53 | 54 | logger.info(f"Finished scraping custom article: {link}") 55 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/dispatcher.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import urlparse 3 | 4 | from loguru import logger 5 | 6 | from .base import BaseCrawler 7 | from .custom_article import CustomArticleCrawler 8 | from .github import GithubCrawler 9 | from .linkedin import LinkedInCrawler 10 | from .medium import MediumCrawler 11 | 12 | 13 | class CrawlerDispatcher: 14 | def __init__(self) -> None: 15 | self._crawlers = {} 16 | 17 | @classmethod 18 | def build(cls) -> "CrawlerDispatcher": 19 | dispatcher = cls() 20 | 21 | return dispatcher 22 | 23 | def register_medium(self) -> "CrawlerDispatcher": 24 | self.register("https://medium.com", MediumCrawler) 25 | 26 | return self 27 | 28 | def register_linkedin(self) -> "CrawlerDispatcher": 29 | self.register("https://linkedin.com", LinkedInCrawler) 30 | 31 | return self 32 | 33 | def register_github(self) -> "CrawlerDispatcher": 34 | self.register("https://github.com", GithubCrawler) 35 | 36 | return self 37 | 38 | def register(self, domain: str, crawler: type[BaseCrawler]) -> None: 39 | parsed_domain = urlparse(domain) 40 | domain = parsed_domain.netloc 41 | 42 | self._crawlers[r"https://(www\.)?{}/*".format(re.escape(domain))] = crawler 43 | 44 | def get_crawler(self, url: str) -> BaseCrawler: 45 | for pattern, crawler in self._crawlers.items(): 46 | if re.match(pattern, url): 47 | return crawler() 48 | else: 49 | logger.warning(f"No crawler found for {url}. 
Defaulting to CustomArticleCrawler.") 50 | 51 | return CustomArticleCrawler() 52 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/github.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | import tempfile 5 | 6 | from loguru import logger 7 | 8 | from llm_engineering.domain.documents import RepositoryDocument 9 | 10 | from .base import BaseCrawler 11 | 12 | 13 | class GithubCrawler(BaseCrawler): 14 | model = RepositoryDocument 15 | 16 | def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None: 17 | super().__init__() 18 | self._ignore = ignore 19 | 20 | def extract(self, link: str, **kwargs) -> None: 21 | old_model = self.model.find(link=link) 22 | if old_model is not None: 23 | logger.info(f"Repository already exists in the database: {link}") 24 | 25 | return 26 | 27 | logger.info(f"Starting scraping GitHub repository: {link}") 28 | 29 | repo_name = link.rstrip("/").split("/")[-1] 30 | 31 | local_temp = tempfile.mkdtemp() 32 | 33 | try: 34 | os.chdir(local_temp) 35 | subprocess.run(["git", "clone", link]) 36 | 37 | repo_path = os.path.join(local_temp, os.listdir(local_temp)[0]) # noqa: PTH118 38 | 39 | tree = {} 40 | for root, _, files in os.walk(repo_path): 41 | dir = root.replace(repo_path, "").lstrip("/") 42 | if dir.startswith(self._ignore): 43 | continue 44 | 45 | for file in files: 46 | if file.endswith(self._ignore): 47 | continue 48 | file_path = os.path.join(dir, file) # noqa: PTH118 49 | with open(os.path.join(root, file), "r", errors="ignore") as f: # noqa: PTH123, PTH118 50 | tree[file_path] = f.read().replace(" ", "") 51 | 52 | user = kwargs["user"] 53 | instance = self.model( 54 | content=tree, 55 | name=repo_name, 56 | link=link, 57 | platform="github", 58 | author_id=user.id, 59 | author_full_name=user.full_name, 60 | ) 61 | instance.save() 62 | 63 | except Exception: 64 | raise 65 | finally: 66 | shutil.rmtree(local_temp) 67 | 68 | logger.info(f"Finished scraping GitHub repository: {link}") 69 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/linkedin.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Dict, List 3 | 4 | from bs4 import BeautifulSoup 5 | from bs4.element import Tag 6 | from loguru import logger 7 | from selenium.webdriver.common.by import By 8 | 9 | from llm_engineering.domain.documents import PostDocument 10 | from llm_engineering.domain.exceptions import ImproperlyConfigured 11 | from llm_engineering.settings import settings 12 | 13 | from .base import BaseSeleniumCrawler 14 | 15 | 16 | class LinkedInCrawler(BaseSeleniumCrawler): 17 | model = PostDocument 18 | 19 | def __init__(self, scroll_limit: int = 5, is_deprecated: bool = True) -> None: 20 | super().__init__(scroll_limit) 21 | 22 | self._is_deprecated = is_deprecated 23 | 24 | def set_extra_driver_options(self, options) -> None: 25 | options.add_experimental_option("detach", True) 26 | 27 | def login(self) -> None: 28 | if self._is_deprecated: 29 | raise DeprecationWarning( 30 | "As LinkedIn has updated its security measures, the login() method is no longer supported." 
31 | ) 32 | 33 | self.driver.get("https://www.linkedin.com/login") 34 | if not settings.LINKEDIN_USERNAME or not settings.LINKEDIN_PASSWORD: 35 | raise ImproperlyConfigured( 36 | "LinkedIn scraper requires the {LINKEDIN_USERNAME} and {LINKEDIN_PASSWORD} settings." 37 | ) 38 | 39 | self.driver.find_element(By.ID, "username").send_keys(settings.LINKEDIN_USERNAME) 40 | self.driver.find_element(By.ID, "password").send_keys(settings.LINKEDIN_PASSWORD) 41 | self.driver.find_element(By.CSS_SELECTOR, ".login__form_action_container button").click() 42 | 43 | def extract(self, link: str, **kwargs) -> None: 44 | if self._is_deprecated: 45 | raise DeprecationWarning( 46 | "As LinkedIn has updated its feed structure, the extract() method is no longer supported." 47 | ) 48 | 49 | if self.model.link is not None: 50 | old_model = self.model.find(link=link) 51 | if old_model is not None: 52 | logger.info(f"Post already exists in the database: {link}") 53 | 54 | return 55 | 56 | logger.info(f"Starting scraping data for profile: {link}") 57 | 58 | self.login() 59 | 60 | soup = self._get_page_content(link) 61 | 62 | data = { # noqa 63 | "Name": self._scrape_section(soup, "h1", class_="text-heading-xlarge"), 64 | "About": self._scrape_section(soup, "div", class_="display-flex ph5 pv3"), 65 | "Main Page": self._scrape_section(soup, "div", {"id": "main-content"}), 66 | "Experience": self._scrape_experience(link), 67 | "Education": self._scrape_education(link), 68 | } 69 | 70 | self.driver.get(link) 71 | time.sleep(5) 72 | button = self.driver.find_element( 73 | By.CSS_SELECTOR, ".app-aware-link.profile-creator-shared-content-view__footer-action" 74 | ) 75 | button.click() 76 | 77 | # Scrolling and scraping posts 78 | self.scroll_page() 79 | soup = BeautifulSoup(self.driver.page_source, "html.parser") 80 | post_elements = soup.find_all( 81 | "div", 82 | class_="update-components-text relative update-components-update-v2__commentary", 83 | ) 84 | buttons = soup.find_all("button", class_="update-components-image__image-link") 85 | post_images = self._extract_image_urls(buttons) 86 | 87 | posts = self._extract_posts(post_elements, post_images) 88 | logger.info(f"Found {len(posts)} posts for profile: {link}") 89 | 90 | self.driver.close() 91 | 92 | user = kwargs["user"] 93 | self.model.bulk_insert( 94 | [ 95 | PostDocument(platform="linkedin", content=post, author_id=user.id, author_full_name=user.full_name) 96 | for post in posts 97 | ] 98 | ) 99 | 100 | logger.info(f"Finished scraping data for profile: {link}") 101 | 102 | def _scrape_section(self, soup: BeautifulSoup, *args, **kwargs) -> str: 103 | """Scrape a specific section of the LinkedIn profile.""" 104 | # Example: Scrape the 'About' section 105 | 106 | parent_div = soup.find(*args, **kwargs) 107 | 108 | return parent_div.get_text(strip=True) if parent_div else "" 109 | 110 | def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]: 111 | """ 112 | Extracts image URLs from button elements. 113 | 114 | Args: 115 | buttons (List[Tag]): A list of BeautifulSoup Tag objects representing buttons. 116 | 117 | Returns: 118 | Dict[str, str]: A dictionary mapping post indexes to image URLs. 
119 | """ 120 | 121 | post_images = {} 122 | for i, button in enumerate(buttons): 123 | img_tag = button.find("img") 124 | if img_tag and "src" in img_tag.attrs: 125 | post_images[f"Post_{i}"] = img_tag["src"] 126 | else: 127 | logger.warning("No image found in this button") 128 | return post_images 129 | 130 | def _get_page_content(self, url: str) -> BeautifulSoup: 131 | """Retrieve the page content of a given URL.""" 132 | 133 | self.driver.get(url) 134 | time.sleep(5) 135 | 136 | return BeautifulSoup(self.driver.page_source, "html.parser") 137 | 138 | def _extract_posts(self, post_elements: List[Tag], post_images: Dict[str, str]) -> Dict[str, Dict[str, str]]: 139 | """ 140 | Extracts post texts and combines them with their respective images. 141 | 142 | Args: 143 | post_elements (List[Tag]): A list of BeautifulSoup Tag objects representing post elements. 144 | post_images (Dict[str, str]): A dictionary containing image URLs mapped by post index. 145 | 146 | Returns: 147 | Dict[str, Dict[str, str]]: A dictionary containing post data with text and optional image URL. 148 | """ 149 | 150 | posts_data = {} 151 | for i, post_element in enumerate(post_elements): 152 | post_text = post_element.get_text(strip=True, separator="\n") 153 | post_data = {"text": post_text} 154 | if f"Post_{i}" in post_images: 155 | post_data["image"] = post_images[f"Post_{i}"] 156 | posts_data[f"Post_{i}"] = post_data 157 | 158 | return posts_data 159 | 160 | def _scrape_experience(self, profile_url: str) -> str: 161 | """Scrapes the Experience section of the LinkedIn profile.""" 162 | 163 | self.driver.get(profile_url + "/details/experience/") 164 | time.sleep(5) 165 | soup = BeautifulSoup(self.driver.page_source, "html.parser") 166 | experience_content = soup.find("section", {"id": "experience-section"}) 167 | 168 | return experience_content.get_text(strip=True) if experience_content else "" 169 | 170 | def _scrape_education(self, profile_url: str) -> str: 171 | self.driver.get(profile_url + "/details/education/") 172 | time.sleep(5) 173 | soup = BeautifulSoup(self.driver.page_source, "html.parser") 174 | education_content = soup.find("section", {"id": "education-section"}) 175 | 176 | return education_content.get_text(strip=True) if education_content else "" 177 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/medium.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from loguru import logger 3 | 4 | from llm_engineering.domain.documents import ArticleDocument 5 | 6 | from .base import BaseSeleniumCrawler 7 | 8 | 9 | class MediumCrawler(BaseSeleniumCrawler): 10 | model = ArticleDocument 11 | 12 | def set_extra_driver_options(self, options) -> None: 13 | options.add_argument(r"--profile-directory=Profile 2") 14 | 15 | def extract(self, link: str, **kwargs) -> None: 16 | old_model = self.model.find(link=link) 17 | if old_model is not None: 18 | logger.info(f"Article already exists in the database: {link}") 19 | 20 | return 21 | 22 | logger.info(f"Starting scraping Medium article: {link}") 23 | 24 | self.driver.get(link) 25 | self.scroll_page() 26 | 27 | soup = BeautifulSoup(self.driver.page_source, "html.parser") 28 | title = soup.find_all("h1", class_="pw-post-title") 29 | subtitle = soup.find_all("h2", class_="pw-subtitle-paragraph") 30 | 31 | data = { 32 | "Title": title[0].string if title else None, 33 | "Subtitle": subtitle[0].string if subtitle else None, 34 |
"Content": soup.get_text(), 35 | } 36 | 37 | self.driver.close() 38 | 39 | user = kwargs["user"] 40 | instance = self.model( 41 | platform="medium", 42 | content=data, 43 | link=link, 44 | author_id=user.id, 45 | author_full_name=user.full_name, 46 | ) 47 | instance.save() 48 | 49 | logger.info(f"Successfully scraped and saved article: {link}") 50 | -------------------------------------------------------------------------------- /llm_engineering/application/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from . import generation 2 | 3 | __all__ = ["generation"] 4 | -------------------------------------------------------------------------------- /llm_engineering/application/dataset/constants.py: -------------------------------------------------------------------------------- 1 | from llm_engineering.domain.dataset import DatasetType 2 | 3 | MOCKED_RESPONSE_INSTRUCT = """ 4 | [ 5 | {"instruction": " 1", "answer": " 1"}, 6 | {"instruction": " 2", "answer": " 2"}, 7 | {"instruction": " 3", "answer": " 3"} 8 | ] 9 | """ 10 | 11 | MOCKED_RESPONSE_PREFERENCE = """ 12 | [ 13 | {"instruction": " 1", "rejected": " 1", "chosen": "Mocked extracted extracted extracted extracted extracted extracted extracted extracted extracted extracted answer 1."}, 14 | {"instruction": " 2", "rejected": " 2", "chosen": "Mocked extracted extracted extracted extracted extracted extracted extracted extracted extracted extracted answer 2."}, 15 | {"instruction": " 3", "rejected": " 3", "chosen": "Mocked extracted answer 3"} 16 | ] 17 | """ 18 | 19 | 20 | def get_mocked_response(dataset_type: DatasetType) -> str: 21 | if dataset_type == DatasetType.INSTRUCTION: 22 | return MOCKED_RESPONSE_INSTRUCT 23 | elif dataset_type == DatasetType.PREFERENCE: 24 | return MOCKED_RESPONSE_PREFERENCE 25 | else: 26 | raise ValueError(f"Invalid dataset type: {dataset_type}") 27 | -------------------------------------------------------------------------------- /llm_engineering/application/dataset/output_parsers.py: -------------------------------------------------------------------------------- 1 | from langchain.output_parsers import PydanticOutputParser 2 | 3 | 4 | class ListPydanticOutputParser(PydanticOutputParser): 5 | def _parse_obj(self, obj: dict | list): 6 | if isinstance(obj, list): 7 | return [super(ListPydanticOutputParser, self)._parse_obj(obj_) for obj_ in obj] 8 | else: 9 | return super(ListPydanticOutputParser, self)._parse_obj(obj) 10 | -------------------------------------------------------------------------------- /llm_engineering/application/dataset/utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | 3 | from llm_engineering.application.preprocessing.operations.chunking import chunk_document 4 | from llm_engineering.domain.cleaned_documents import CleanedDocument 5 | from llm_engineering.domain.dataset import ( 6 | InstructDataset, 7 | InstructDatasetSample, 8 | InstructTrainTestSplit, 9 | PreferenceDataset, 10 | PreferenceDatasetSample, 11 | PreferenceTrainTestSplit, 12 | ) 13 | from llm_engineering.domain.types import DataCategory 14 | 15 | 16 | def create_instruct_train_test_split( 17 | data: dict[DataCategory, InstructDataset], test_size=0.2, random_state=42 18 | ) -> InstructTrainTestSplit: 19 | train_data = {} 20 | test_data = {} 21 | 22 | for category, dataset in data.items(): 23 | samples = dataset.samples 24 | samples_dicts = [sample.model_dump() for sample 
in samples] 25 | 26 | if len(samples_dicts) > 0: 27 | train_samples_dicts, test_samples_dicts = train_test_split( 28 | samples_dicts, test_size=test_size, random_state=random_state 29 | ) 30 | train_samples = [InstructDatasetSample(**sample_dict) for sample_dict in train_samples_dicts] 31 | test_samples = [InstructDatasetSample(**sample_dict) for sample_dict in test_samples_dicts] 32 | else: 33 | train_samples = [] 34 | test_samples = [] 35 | 36 | train_dataset = InstructDataset(category=category, samples=train_samples) 37 | test_dataset = InstructDataset(category=category, samples=test_samples) 38 | 39 | train_data[category] = train_dataset 40 | test_data[category] = test_dataset 41 | 42 | return InstructTrainTestSplit(train=train_data, test=test_data, test_split_size=test_size) 43 | 44 | 45 | def create_preference_train_test_split( 46 | data: dict[DataCategory, PreferenceDataset], test_size=0.2, random_state=42 47 | ) -> PreferenceTrainTestSplit: 48 | train_data = {} 49 | test_data = {} 50 | 51 | for category, dataset in data.items(): 52 | samples = dataset.samples 53 | samples_dicts = [sample.model_dump() for sample in samples] 54 | 55 | if len(samples_dicts) > 0: 56 | train_samples_dicts, test_samples_dicts = train_test_split( 57 | samples_dicts, test_size=test_size, random_state=random_state 58 | ) 59 | train_samples = [PreferenceDatasetSample(**sample_dict) for sample_dict in train_samples_dicts] 60 | test_samples = [PreferenceDatasetSample(**sample_dict) for sample_dict in test_samples_dicts] 61 | else: 62 | train_samples = [] 63 | test_samples = [] 64 | 65 | train_dataset = PreferenceDataset(category=category, samples=train_samples) 66 | test_dataset = PreferenceDataset(category=category, samples=test_samples) 67 | 68 | train_data[category] = train_dataset 69 | test_data[category] = test_dataset 70 | 71 | return PreferenceTrainTestSplit(train=train_data, test=test_data, test_split_size=test_size) 72 | 73 | 74 | def filter_short_answers( 75 | data: dict[DataCategory, PreferenceDataset], min_length: int = 100 76 | ) -> dict[DataCategory, PreferenceDataset]: 77 | def is_long_enough(example: PreferenceDatasetSample) -> bool: 78 | return len(example.chosen) >= min_length 79 | 80 | filtered_data = {} 81 | for category, dataset in data.items(): 82 | filtered_dataset_samples = list(filter(is_long_enough, dataset.samples)) 83 | filtered_dataset = PreferenceDataset(category=category, samples=filtered_dataset_samples) 84 | 85 | filtered_data[category] = filtered_dataset 86 | 87 | return filtered_data 88 | 89 | 90 | def filter_answer_format(data: dict[DataCategory, PreferenceDataset]) -> dict[DataCategory, PreferenceDataset]: 91 | def is_valid_format(example: PreferenceDatasetSample) -> bool: 92 | chosen = example.chosen 93 | 94 | return len(chosen) > 0 and chosen[0].isupper() and chosen[-1] in (".", "!", "?") 95 | 96 | filtered_data = {} 97 | for category, dataset in data.items(): 98 | filtered_dataset_samples = list(filter(is_valid_format, dataset.samples)) 99 | filtered_dataset = PreferenceDataset(category=category, samples=filtered_dataset_samples) 100 | 101 | filtered_data[category] = filtered_dataset 102 | 103 | return filtered_data 104 | 105 | 106 | def extract_substrings( 107 | documents: list[CleanedDocument], min_length: int = 1000, max_length: int = 2000 108 | ) -> list[CleanedDocument]: 109 | extracts = [] 110 | for document in documents: 111 | document_extracts = chunk_document(document.content, min_length, max_length) 112 | for extract in document_extracts: 113 | subdocument
= document.model_copy() 114 | subdocument.content = extract 115 | 116 | extracts.append(subdocument) 117 | 118 | return extracts 119 | -------------------------------------------------------------------------------- /llm_engineering/application/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .embeddings import CrossEncoderModelSingleton, EmbeddingModelSingleton 2 | 3 | __all__ = ["EmbeddingModelSingleton", "CrossEncoderModelSingleton"] 4 | -------------------------------------------------------------------------------- /llm_engineering/application/networks/base.py: -------------------------------------------------------------------------------- 1 | from threading import Lock 2 | from typing import ClassVar 3 | 4 | 5 | class SingletonMeta(type): 6 | """ 7 | This is a thread-safe implementation of Singleton. 8 | """ 9 | 10 | _instances: ClassVar = {} 11 | 12 | _lock: Lock = Lock() 13 | 14 | """ 15 | We now have a lock object that will be used to synchronize threads during 16 | first access to the Singleton. 17 | """ 18 | 19 | def __call__(cls, *args, **kwargs): 20 | """ 21 | Possible changes to the value of the `__init__` argument do not affect 22 | the returned instance. 23 | """ 24 | # Now, imagine that the program has just been launched. Since there's no 25 | # Singleton instance yet, multiple threads can simultaneously pass the 26 | # previous conditional and reach this point almost at the same time. The 27 | # first of them will acquire lock and will proceed further, while the 28 | # rest will wait here. 29 | with cls._lock: 30 | # The first thread to acquire the lock, reaches this conditional, 31 | # goes inside and creates the Singleton instance. Once it leaves the 32 | # lock block, a thread that might have been waiting for the lock 33 | # release may then enter this section. But since the Singleton field 34 | # is already initialized, the thread won't create a new object. 35 | if cls not in cls._instances: 36 | instance = super().__call__(*args, **kwargs) 37 | cls._instances[cls] = instance 38 | 39 | return cls._instances[cls] 40 | -------------------------------------------------------------------------------- /llm_engineering/application/networks/embeddings.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import numpy as np 6 | from loguru import logger 7 | from numpy.typing import NDArray 8 | from sentence_transformers.SentenceTransformer import SentenceTransformer 9 | from sentence_transformers.cross_encoder import CrossEncoder 10 | from transformers import AutoTokenizer 11 | 12 | from llm_engineering.settings import settings 13 | 14 | from .base import SingletonMeta 15 | 16 | 17 | class EmbeddingModelSingleton(metaclass=SingletonMeta): 18 | """ 19 | A singleton class that provides a pre-trained transformer model for generating embeddings of input text. 
20 | """ 21 | 22 | def __init__( 23 | self, 24 | model_id: str = settings.TEXT_EMBEDDING_MODEL_ID, 25 | device: str = settings.RAG_MODEL_DEVICE, 26 | cache_dir: Optional[Path] = None, 27 | ) -> None: 28 | self._model_id = model_id 29 | self._device = device 30 | 31 | self._model = SentenceTransformer( 32 | self._model_id, 33 | device=self._device, 34 | cache_folder=str(cache_dir) if cache_dir else None, 35 | ) 36 | self._model.eval() 37 | 38 | @property 39 | def model_id(self) -> str: 40 | """ 41 | Returns the identifier of the pre-trained transformer model to use. 42 | 43 | Returns: 44 | str: The identifier of the pre-trained transformer model to use. 45 | """ 46 | 47 | return self._model_id 48 | 49 | @cached_property 50 | def embedding_size(self) -> int: 51 | """ 52 | Returns the size of the embeddings generated by the pre-trained transformer model. 53 | 54 | Returns: 55 | int: The size of the embeddings generated by the pre-trained transformer model. 56 | """ 57 | 58 | dummy_embedding = self._model.encode("") 59 | 60 | return dummy_embedding.shape[0] 61 | 62 | @property 63 | def max_input_length(self) -> int: 64 | """ 65 | Returns the maximum length of input text to tokenize. 66 | 67 | Returns: 68 | int: The maximum length of input text to tokenize. 69 | """ 70 | 71 | return self._model.max_seq_length 72 | 73 | @property 74 | def tokenizer(self) -> AutoTokenizer: 75 | """ 76 | Returns the tokenizer used to tokenize input text. 77 | 78 | Returns: 79 | AutoTokenizer: The tokenizer used to tokenize input text. 80 | """ 81 | 82 | return self._model.tokenizer 83 | 84 | def __call__( 85 | self, input_text: str | list[str], to_list: bool = True 86 | ) -> NDArray[np.float32] | list[float] | list[list[float]]: 87 | """ 88 | Generates embeddings for the input text using the pre-trained transformer model. 89 | 90 | Args: 91 | input_text (str): The input text to generate embeddings for. 92 | to_list (bool): Whether to return the embeddings as a list or numpy array. Defaults to True. 93 | 94 | Returns: 95 | Union[np.ndarray, list]: The embeddings generated for the input text. 96 | """ 97 | 98 | try: 99 | embeddings = self._model.encode(input_text) 100 | except Exception: 101 | logger.error(f"Error generating embeddings for {self._model_id=} and {input_text=}") 102 | 103 | return [] if to_list else np.array([]) 104 | 105 | if to_list: 106 | embeddings = embeddings.tolist() 107 | 108 | return embeddings 109 | 110 | 111 | class CrossEncoderModelSingleton(metaclass=SingletonMeta): 112 | def __init__( 113 | self, 114 | model_id: str = settings.RERANKING_CROSS_ENCODER_MODEL_ID, 115 | device: str = settings.RAG_MODEL_DEVICE, 116 | ) -> None: 117 | """ 118 | A singleton class that provides a pre-trained cross-encoder model for scoring pairs of input text. 
119 | """ 120 | 121 | self._model_id = model_id 122 | self._device = device 123 | 124 | self._model = CrossEncoder( 125 | model_name=self._model_id, 126 | device=self._device, 127 | ) 128 | self._model.model.eval() 129 | 130 | def __call__(self, pairs: list[tuple[str, str]], to_list: bool = True) -> NDArray[np.float32] | list[float]: 131 | scores = self._model.predict(pairs) 132 | 133 | if to_list: 134 | scores = scores.tolist() 135 | 136 | return scores 137 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .dispatchers import ChunkingDispatcher, CleaningDispatcher, EmbeddingDispatcher 2 | 3 | __all__ = ["CleaningDispatcher", "ChunkingDispatcher", "EmbeddingDispatcher"] 4 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/chunking_data_handlers.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from abc import ABC, abstractmethod 3 | from typing import Generic, TypeVar 4 | from uuid import UUID 5 | 6 | from llm_engineering.domain.chunks import ArticleChunk, Chunk, PostChunk, RepositoryChunk 7 | from llm_engineering.domain.cleaned_documents import ( 8 | CleanedArticleDocument, 9 | CleanedDocument, 10 | CleanedPostDocument, 11 | CleanedRepositoryDocument, 12 | ) 13 | 14 | from .operations import chunk_article, chunk_text 15 | 16 | CleanedDocumentT = TypeVar("CleanedDocumentT", bound=CleanedDocument) 17 | ChunkT = TypeVar("ChunkT", bound=Chunk) 18 | 19 | 20 | class ChunkingDataHandler(ABC, Generic[CleanedDocumentT, ChunkT]): 21 | """ 22 | Abstract class for all Chunking data handlers. 
23 | All data transformations logic for the chunking step is done here 24 | """ 25 | 26 | @property 27 | def metadata(self) -> dict: 28 | return { 29 | "chunk_size": 500, 30 | "chunk_overlap": 50, 31 | } 32 | 33 | @abstractmethod 34 | def chunk(self, data_model: CleanedDocumentT) -> list[ChunkT]: 35 | pass 36 | 37 | 38 | class PostChunkingHandler(ChunkingDataHandler): 39 | @property 40 | def metadata(self) -> dict: 41 | return { 42 | "chunk_size": 250, 43 | "chunk_overlap": 25, 44 | } 45 | 46 | def chunk(self, data_model: CleanedPostDocument) -> list[PostChunk]: 47 | data_models_list = [] 48 | 49 | cleaned_content = data_model.content 50 | chunks = chunk_text( 51 | cleaned_content, chunk_size=self.metadata["chunk_size"], chunk_overlap=self.metadata["chunk_overlap"] 52 | ) 53 | 54 | for chunk in chunks: 55 | chunk_id = hashlib.md5(chunk.encode()).hexdigest() 56 | model = PostChunk( 57 | id=UUID(chunk_id, version=4), 58 | content=chunk, 59 | platform=data_model.platform, 60 | document_id=data_model.id, 61 | author_id=data_model.author_id, 62 | author_full_name=data_model.author_full_name, 63 | image=data_model.image if data_model.image else None, 64 | metadata=self.metadata, 65 | ) 66 | data_models_list.append(model) 67 | 68 | return data_models_list 69 | 70 | 71 | class ArticleChunkingHandler(ChunkingDataHandler): 72 | @property 73 | def metadata(self) -> dict: 74 | return { 75 | "min_length": 1000, 76 | "max_length": 2000, 77 | } 78 | 79 | def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]: 80 | data_models_list = [] 81 | 82 | cleaned_content = data_model.content 83 | chunks = chunk_article( 84 | cleaned_content, min_length=self.metadata["min_length"], max_length=self.metadata["max_length"] 85 | ) 86 | 87 | for chunk in chunks: 88 | chunk_id = hashlib.md5(chunk.encode()).hexdigest() 89 | model = ArticleChunk( 90 | id=UUID(chunk_id, version=4), 91 | content=chunk, 92 | platform=data_model.platform, 93 | link=data_model.link, 94 | document_id=data_model.id, 95 | author_id=data_model.author_id, 96 | author_full_name=data_model.author_full_name, 97 | metadata=self.metadata, 98 | ) 99 | data_models_list.append(model) 100 | 101 | return data_models_list 102 | 103 | 104 | class RepositoryChunkingHandler(ChunkingDataHandler): 105 | @property 106 | def metadata(self) -> dict: 107 | return { 108 | "chunk_size": 1500, 109 | "chunk_overlap": 100, 110 | } 111 | 112 | def chunk(self, data_model: CleanedRepositoryDocument) -> list[RepositoryChunk]: 113 | data_models_list = [] 114 | 115 | cleaned_content = data_model.content 116 | chunks = chunk_text( 117 | cleaned_content, chunk_size=self.metadata["chunk_size"], chunk_overlap=self.metadata["chunk_overlap"] 118 | ) 119 | 120 | for chunk in chunks: 121 | chunk_id = hashlib.md5(chunk.encode()).hexdigest() 122 | model = RepositoryChunk( 123 | id=UUID(chunk_id, version=4), 124 | content=chunk, 125 | platform=data_model.platform, 126 | name=data_model.name, 127 | link=data_model.link, 128 | document_id=data_model.id, 129 | author_id=data_model.author_id, 130 | author_full_name=data_model.author_full_name, 131 | metadata=self.metadata, 132 | ) 133 | data_models_list.append(model) 134 | 135 | return data_models_list 136 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/cleaning_data_handlers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar 3 | 4 | 
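# The cleaning handlers below mirror the chunking handlers above: one handler
# per data category, each mapping a raw Document into its CleanedDocument
# counterpart by flattening the content dict into a single cleaned string.
# A minimal usage sketch (the `post_document` instance is hypothetical):
#
#   handler = PostCleaningHandler()
#   cleaned = handler.clean(post_document)   # -> CleanedPostDocument
#   assert isinstance(cleaned.content, str)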
from llm_engineering.domain.cleaned_documents import ( 5 | CleanedArticleDocument, 6 | CleanedDocument, 7 | CleanedPostDocument, 8 | CleanedRepositoryDocument, 9 | ) 10 | from llm_engineering.domain.documents import ( 11 | ArticleDocument, 12 | Document, 13 | PostDocument, 14 | RepositoryDocument, 15 | ) 16 | 17 | from .operations import clean_text 18 | 19 | DocumentT = TypeVar("DocumentT", bound=Document) 20 | CleanedDocumentT = TypeVar("CleanedDocumentT", bound=CleanedDocument) 21 | 22 | 23 | class CleaningDataHandler(ABC, Generic[DocumentT, CleanedDocumentT]): 24 | """ 25 | Abstract class for all cleaning data handlers. 26 | All data transformations logic for the cleaning step is done here 27 | """ 28 | 29 | @abstractmethod 30 | def clean(self, data_model: DocumentT) -> CleanedDocumentT: 31 | pass 32 | 33 | 34 | class PostCleaningHandler(CleaningDataHandler): 35 | def clean(self, data_model: PostDocument) -> CleanedPostDocument: 36 | return CleanedPostDocument( 37 | id=data_model.id, 38 | content=clean_text(" #### ".join(data_model.content.values())), 39 | platform=data_model.platform, 40 | author_id=data_model.author_id, 41 | author_full_name=data_model.author_full_name, 42 | image=data_model.image if data_model.image else None, 43 | ) 44 | 45 | 46 | class ArticleCleaningHandler(CleaningDataHandler): 47 | def clean(self, data_model: ArticleDocument) -> CleanedArticleDocument: 48 | valid_content = [content for content in data_model.content.values() if content] 49 | 50 | return CleanedArticleDocument( 51 | id=data_model.id, 52 | content=clean_text(" #### ".join(valid_content)), 53 | platform=data_model.platform, 54 | link=data_model.link, 55 | author_id=data_model.author_id, 56 | author_full_name=data_model.author_full_name, 57 | ) 58 | 59 | 60 | class RepositoryCleaningHandler(CleaningDataHandler): 61 | def clean(self, data_model: RepositoryDocument) -> CleanedRepositoryDocument: 62 | return CleanedRepositoryDocument( 63 | id=data_model.id, 64 | content=clean_text(" #### ".join(data_model.content.values())), 65 | platform=data_model.platform, 66 | name=data_model.name, 67 | link=data_model.link, 68 | author_id=data_model.author_id, 69 | author_full_name=data_model.author_full_name, 70 | ) 71 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/dispatchers.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | from llm_engineering.domain.base import NoSQLBaseDocument, VectorBaseDocument 4 | from llm_engineering.domain.types import DataCategory 5 | 6 | from .chunking_data_handlers import ( 7 | ArticleChunkingHandler, 8 | ChunkingDataHandler, 9 | PostChunkingHandler, 10 | RepositoryChunkingHandler, 11 | ) 12 | from .cleaning_data_handlers import ( 13 | ArticleCleaningHandler, 14 | CleaningDataHandler, 15 | PostCleaningHandler, 16 | RepositoryCleaningHandler, 17 | ) 18 | from .embedding_data_handlers import ( 19 | ArticleEmbeddingHandler, 20 | EmbeddingDataHandler, 21 | PostEmbeddingHandler, 22 | QueryEmbeddingHandler, 23 | RepositoryEmbeddingHandler, 24 | ) 25 | 26 | 27 | class CleaningHandlerFactory: 28 | @staticmethod 29 | def create_handler(data_category: DataCategory) -> CleaningDataHandler: 30 | if data_category == DataCategory.POSTS: 31 | return PostCleaningHandler() 32 | elif data_category == DataCategory.ARTICLES: 33 | return ArticleCleaningHandler() 34 | elif data_category == DataCategory.REPOSITORIES: 35 | return RepositoryCleaningHandler() 
36 | else: 37 | raise ValueError("Unsupported data type") 38 | 39 | 40 | class CleaningDispatcher: 41 | factory = CleaningHandlerFactory() 42 | 43 | @classmethod 44 | def dispatch(cls, data_model: NoSQLBaseDocument) -> VectorBaseDocument: 45 | data_category = DataCategory(data_model.get_collection_name()) 46 | handler = cls.factory.create_handler(data_category) 47 | clean_model = handler.clean(data_model) 48 | 49 | logger.info( 50 | "Document cleaned successfully.", 51 | data_category=data_category, 52 | cleaned_content_len=len(clean_model.content), 53 | ) 54 | 55 | return clean_model 56 | 57 | 58 | class ChunkingHandlerFactory: 59 | @staticmethod 60 | def create_handler(data_category: DataCategory) -> ChunkingDataHandler: 61 | if data_category == DataCategory.POSTS: 62 | return PostChunkingHandler() 63 | elif data_category == DataCategory.ARTICLES: 64 | return ArticleChunkingHandler() 65 | elif data_category == DataCategory.REPOSITORIES: 66 | return RepositoryChunkingHandler() 67 | else: 68 | raise ValueError("Unsupported data type") 69 | 70 | 71 | class ChunkingDispatcher: 72 | factory = ChunkingHandlerFactory 73 | 74 | @classmethod 75 | def dispatch(cls, data_model: VectorBaseDocument) -> list[VectorBaseDocument]: 76 | data_category = data_model.get_category() 77 | handler = cls.factory.create_handler(data_category) 78 | chunk_models = handler.chunk(data_model) 79 | 80 | logger.info( 81 | "Document chunked successfully.", 82 | num=len(chunk_models), 83 | data_category=data_category, 84 | ) 85 | 86 | return chunk_models 87 | 88 | 89 | class EmbeddingHandlerFactory: 90 | @staticmethod 91 | def create_handler(data_category: DataCategory) -> EmbeddingDataHandler: 92 | if data_category == DataCategory.QUERIES: 93 | return QueryEmbeddingHandler() 94 | if data_category == DataCategory.POSTS: 95 | return PostEmbeddingHandler() 96 | elif data_category == DataCategory.ARTICLES: 97 | return ArticleEmbeddingHandler() 98 | elif data_category == DataCategory.REPOSITORIES: 99 | return RepositoryEmbeddingHandler() 100 | else: 101 | raise ValueError("Unsupported data type") 102 | 103 | 104 | class EmbeddingDispatcher: 105 | factory = EmbeddingHandlerFactory 106 | 107 | @classmethod 108 | def dispatch( 109 | cls, data_model: VectorBaseDocument | list[VectorBaseDocument] 110 | ) -> VectorBaseDocument | list[VectorBaseDocument]: 111 | is_list = isinstance(data_model, list) 112 | if not is_list: 113 | data_model = [data_model] 114 | 115 | if len(data_model) == 0: 116 | return [] 117 | 118 | data_category = data_model[0].get_category() 119 | assert all( 120 | data_model.get_category() == data_category for data_model in data_model 121 | ), "Data models must be of the same category." 
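# At this point every chunk in the batch is known to share one category, so a
# single category-specific handler can embed the whole batch in one encode()
# pass instead of calling the embedding model once per chunk.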
122 | handler = cls.factory.create_handler(data_category) 123 | 124 | embedded_chunk_model = handler.embed_batch(data_model) 125 | 126 | if not is_list: 127 | embedded_chunk_model = embedded_chunk_model[0] 128 | 129 | logger.info( 130 | "Data embedded successfully.", 131 | data_category=data_category, 132 | ) 133 | 134 | return embedded_chunk_model 135 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/embedding_data_handlers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar, cast 3 | 4 | from llm_engineering.application.networks import EmbeddingModelSingleton 5 | from llm_engineering.domain.chunks import ArticleChunk, Chunk, PostChunk, RepositoryChunk 6 | from llm_engineering.domain.embedded_chunks import ( 7 | EmbeddedArticleChunk, 8 | EmbeddedChunk, 9 | EmbeddedPostChunk, 10 | EmbeddedRepositoryChunk, 11 | ) 12 | from llm_engineering.domain.queries import EmbeddedQuery, Query 13 | 14 | ChunkT = TypeVar("ChunkT", bound=Chunk) 15 | EmbeddedChunkT = TypeVar("EmbeddedChunkT", bound=EmbeddedChunk) 16 | 17 | embedding_model = EmbeddingModelSingleton() 18 | 19 | 20 | class EmbeddingDataHandler(ABC, Generic[ChunkT, EmbeddedChunkT]): 21 | """ 22 | Abstract class for all embedding data handlers. 23 | All data transformations logic for the embedding step is done here 24 | """ 25 | 26 | def embed(self, data_model: ChunkT) -> EmbeddedChunkT: 27 | return self.embed_batch([data_model])[0] 28 | 29 | def embed_batch(self, data_model: list[ChunkT]) -> list[EmbeddedChunkT]: 30 | embedding_model_input = [data_model.content for data_model in data_model] 31 | embeddings = embedding_model(embedding_model_input, to_list=True) 32 | 33 | embedded_chunk = [ 34 | self.map_model(data_model, cast(list[float], embedding)) 35 | for data_model, embedding in zip(data_model, embeddings, strict=False) 36 | ] 37 | 38 | return embedded_chunk 39 | 40 | @abstractmethod 41 | def map_model(self, data_model: ChunkT, embedding: list[float]) -> EmbeddedChunkT: 42 | pass 43 | 44 | 45 | class QueryEmbeddingHandler(EmbeddingDataHandler): 46 | def map_model(self, data_model: Query, embedding: list[float]) -> EmbeddedQuery: 47 | return EmbeddedQuery( 48 | id=data_model.id, 49 | author_id=data_model.author_id, 50 | author_full_name=data_model.author_full_name, 51 | content=data_model.content, 52 | embedding=embedding, 53 | metadata={ 54 | "embedding_model_id": embedding_model.model_id, 55 | "embedding_size": embedding_model.embedding_size, 56 | "max_input_length": embedding_model.max_input_length, 57 | }, 58 | ) 59 | 60 | 61 | class PostEmbeddingHandler(EmbeddingDataHandler): 62 | def map_model(self, data_model: PostChunk, embedding: list[float]) -> EmbeddedPostChunk: 63 | return EmbeddedPostChunk( 64 | id=data_model.id, 65 | content=data_model.content, 66 | embedding=embedding, 67 | platform=data_model.platform, 68 | document_id=data_model.document_id, 69 | author_id=data_model.author_id, 70 | author_full_name=data_model.author_full_name, 71 | metadata={ 72 | "embedding_model_id": embedding_model.model_id, 73 | "embedding_size": embedding_model.embedding_size, 74 | "max_input_length": embedding_model.max_input_length, 75 | }, 76 | ) 77 | 78 | 79 | class ArticleEmbeddingHandler(EmbeddingDataHandler): 80 | def map_model(self, data_model: ArticleChunk, embedding: list[float]) -> EmbeddedArticleChunk: 81 | return EmbeddedArticleChunk( 82 | 
id=data_model.id, 83 | content=data_model.content, 84 | embedding=embedding, 85 | platform=data_model.platform, 86 | link=data_model.link, 87 | document_id=data_model.document_id, 88 | author_id=data_model.author_id, 89 | author_full_name=data_model.author_full_name, 90 | metadata={ 91 | "embedding_model_id": embedding_model.model_id, 92 | "embedding_size": embedding_model.embedding_size, 93 | "max_input_length": embedding_model.max_input_length, 94 | }, 95 | ) 96 | 97 | 98 | class RepositoryEmbeddingHandler(EmbeddingDataHandler): 99 | def map_model(self, data_model: RepositoryChunk, embedding: list[float]) -> EmbeddedRepositoryChunk: 100 | return EmbeddedRepositoryChunk( 101 | id=data_model.id, 102 | content=data_model.content, 103 | embedding=embedding, 104 | platform=data_model.platform, 105 | name=data_model.name, 106 | link=data_model.link, 107 | document_id=data_model.document_id, 108 | author_id=data_model.author_id, 109 | author_full_name=data_model.author_full_name, 110 | metadata={ 111 | "embedding_model_id": embedding_model.model_id, 112 | "embedding_size": embedding_model.embedding_size, 113 | "max_input_length": embedding_model.max_input_length, 114 | }, 115 | ) 116 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/operations/__init__.py: -------------------------------------------------------------------------------- 1 | from .chunking import chunk_article, chunk_text 2 | from .cleaning import clean_text 3 | 4 | __all__ = [ 5 | "chunk_article", 6 | "chunk_text", 7 | "clean_text", 8 | ] 9 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/operations/chunking.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter 4 | 5 | from llm_engineering.application.networks import EmbeddingModelSingleton 6 | 7 | embedding_model = EmbeddingModelSingleton() 8 | 9 | 10 | def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list[str]: 11 | character_splitter = RecursiveCharacterTextSplitter(separators=["\n\n"], chunk_size=chunk_size, chunk_overlap=0) 12 | text_split_by_characters = character_splitter.split_text(text) 13 | 14 | token_splitter = SentenceTransformersTokenTextSplitter( 15 | chunk_overlap=chunk_overlap, 16 | tokens_per_chunk=embedding_model.max_input_length, 17 | model_name=embedding_model.model_id, 18 | ) 19 | chunks_by_tokens = [] 20 | for section in text_split_by_characters: 21 | chunks_by_tokens.extend(token_splitter.split_text(section)) 22 | 23 | return chunks_by_tokens 24 | 25 | 26 | def chunk_document(text: str, min_length: int, max_length: int) -> list[str]: 27 | """Alias for chunk_article().""" 28 | 29 | return chunk_article(text, min_length, max_length) 30 | 31 | 32 | def chunk_article(text: str, min_length: int, max_length: int) -> list[str]: 33 | sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text) 34 | 35 | extracts = [] 36 | current_chunk = "" 37 | for sentence in sentences: 38 | sentence = sentence.strip() 39 | if not sentence: 40 | continue 41 | 42 | if len(current_chunk) + len(sentence) <= max_length: 43 | current_chunk += sentence + " " 44 | else: 45 | if len(current_chunk) >= min_length: 46 | extracts.append(current_chunk.strip()) 47 | current_chunk = sentence + " " 48 | 49 | if len(current_chunk) >= min_length: 50 | extracts.append(current_chunk.strip()) 51 | 52 | return extracts 53 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/operations/cleaning.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def
clean_text(text: str) -> str: 5 | text = re.sub(r"[^\w\s.,!?]", " ", text) 6 | text = re.sub(r"\s+", " ", text) 7 | 8 | return text.strip() 9 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/application/rag/__init__.py -------------------------------------------------------------------------------- /llm_engineering/application/rag/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any 3 | 4 | from langchain.prompts import PromptTemplate 5 | from pydantic import BaseModel 6 | 7 | from llm_engineering.domain.queries import Query 8 | 9 | 10 | class PromptTemplateFactory(ABC, BaseModel): 11 | @abstractmethod 12 | def create_template(self) -> PromptTemplate: 13 | pass 14 | 15 | 16 | class RAGStep(ABC): 17 | def __init__(self, mock: bool = False) -> None: 18 | self._mock = mock 19 | 20 | @abstractmethod 21 | def generate(self, query: Query, *args, **kwargs) -> Any: 22 | pass 23 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/prompt_templates.py: -------------------------------------------------------------------------------- 1 | from langchain.prompts import PromptTemplate 2 | 3 | from .base import PromptTemplateFactory 4 | 5 | 6 | class QueryExpansionTemplate(PromptTemplateFactory): 7 | prompt: str = """You are an AI language model assistant. Your task is to generate {expand_to_n} 8 | different versions of the given user question to retrieve relevant documents from a vector 9 | database. By generating multiple perspectives on the user question, your goal is to help 10 | the user overcome some of the limitations of the distance-based similarity search. 11 | Provide these alternative questions separated by '{separator}'. 12 | Original question: {question}""" 13 | 14 | @property 15 | def separator(self) -> str: 16 | return "#next-question#" 17 | 18 | def create_template(self, expand_to_n: int) -> PromptTemplate: 19 | return PromptTemplate( 20 | template=self.prompt, 21 | input_variables=["question"], 22 | partial_variables={ 23 | "separator": self.separator, 24 | "expand_to_n": expand_to_n, 25 | }, 26 | ) 27 | 28 | 29 | class SelfQueryTemplate(PromptTemplateFactory): 30 | prompt: str = """You are an AI language model assistant. Your task is to extract information from a user question. 31 | The required information that needs to be extracted is the user name or user id. 32 | Your response should consist of only the extracted user name (e.g., John Doe) or id (e.g., 1345256), nothing else. 33 | If the user question does not contain any user name or id, you should return the following token: none. 34 | 35 | For example: 36 | QUESTION 1: 37 | My name is Paul Iusztin and I want a post about... 38 | RESPONSE 1: 39 | Paul Iusztin 40 | 41 | QUESTION 2: 42 | I want to write a post about... 43 | RESPONSE 2: 44 | none 45 | 46 | QUESTION 3: 47 | My user id is 1345256 and I want to write a post about...
48 | RESPONSE 3: 49 | 1345256 50 | 51 | User question: {question}""" 52 | 53 | def create_template(self) -> PromptTemplate: 54 | return PromptTemplate(template=self.prompt, input_variables=["question"]) 55 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/query_expanison.py: -------------------------------------------------------------------------------- 1 | import opik 2 | from langchain_openai import ChatOpenAI 3 | from loguru import logger 4 | 5 | from llm_engineering.domain.queries import Query 6 | from llm_engineering.settings import settings 7 | 8 | from .base import RAGStep 9 | from .prompt_templates import QueryExpansionTemplate 10 | 11 | 12 | class QueryExpansion(RAGStep): 13 | @opik.track(name="QueryExpansion.generate") 14 | def generate(self, query: Query, expand_to_n: int) -> list[Query]: 15 | assert expand_to_n > 0, f"'expand_to_n' should be greater than 0. Got {expand_to_n}." 16 | 17 | if self._mock: 18 | return [query for _ in range(expand_to_n)] 19 | 20 | query_expansion_template = QueryExpansionTemplate() 21 | prompt = query_expansion_template.create_template(expand_to_n - 1) 22 | model = ChatOpenAI(model=settings.OPENAI_MODEL_ID, api_key=settings.OPENAI_API_KEY, temperature=0) 23 | 24 | chain = prompt | model 25 | 26 | response = chain.invoke({"question": query}) 27 | result = response.content 28 | 29 | queries_content = result.strip().split(query_expansion_template.separator) 30 | 31 | queries = [query] 32 | queries += [ 33 | query.replace_content(stripped_content) 34 | for content in queries_content 35 | if (stripped_content := content.strip()) 36 | ] 37 | 38 | return queries 39 | 40 | 41 | if __name__ == "__main__": 42 | query = Query.from_str("Write an article about the best types of advanced RAG methods.") 43 | query_expander = QueryExpansion() 44 | expanded_queries = query_expander.generate(query, expand_to_n=3) 45 | for expanded_query in expanded_queries: 46 | logger.info(expanded_query.content) 47 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/reranking.py: -------------------------------------------------------------------------------- 1 | import opik 2 | 3 | from llm_engineering.application.networks import CrossEncoderModelSingleton 4 | from llm_engineering.domain.embedded_chunks import EmbeddedChunk 5 | from llm_engineering.domain.queries import Query 6 | 7 | from .base import RAGStep 8 | 9 | 10 | class Reranker(RAGStep): 11 | def __init__(self, mock: bool = False) -> None: 12 | super().__init__(mock=mock) 13 | 14 | self._model = CrossEncoderModelSingleton() 15 | 16 | @opik.track(name="Reranker.generate") 17 | def generate(self, query: Query, chunks: list[EmbeddedChunk], keep_top_k: int) -> list[EmbeddedChunk]: 18 | if self._mock: 19 | return chunks 20 | 21 | query_doc_tuples = [(query.content, chunk.content) for chunk in chunks] 22 | scores = self._model(query_doc_tuples) 23 | 24 | scored_query_doc_tuples = list(zip(scores, chunks, strict=False)) 25 | scored_query_doc_tuples.sort(key=lambda x: x[0], reverse=True) 26 | 27 | reranked_documents = scored_query_doc_tuples[:keep_top_k] 28 | reranked_documents = [doc for _, doc in reranked_documents] 29 | 30 | return reranked_documents 31 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/retriever.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | 3 | 
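# Retrieval flow implemented below: self-query metadata extraction, query
# expansion, parallel vector search per data category, deduplication, then
# cross-encoder reranking. A minimal usage sketch (assumes Qdrant and the
# OpenAI settings are configured; the query string is illustrative):
#
#   retriever = ContextRetriever(mock=False)
#   documents = retriever.search("Write an article about RAG.", k=3)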
import opik 4 | from loguru import logger 5 | from qdrant_client.models import FieldCondition, Filter, MatchValue 6 | 7 | from llm_engineering.application import utils 8 | from llm_engineering.application.preprocessing.dispatchers import EmbeddingDispatcher 9 | from llm_engineering.domain.embedded_chunks import ( 10 | EmbeddedArticleChunk, 11 | EmbeddedChunk, 12 | EmbeddedPostChunk, 13 | EmbeddedRepositoryChunk, 14 | ) 15 | from llm_engineering.domain.queries import EmbeddedQuery, Query 16 | 17 | from .query_expanison import QueryExpansion 18 | from .reranking import Reranker 19 | from .self_query import SelfQuery 20 | 21 | 22 | class ContextRetriever: 23 | def __init__(self, mock: bool = False) -> None: 24 | self._query_expander = QueryExpansion(mock=mock) 25 | self._metadata_extractor = SelfQuery(mock=mock) 26 | self._reranker = Reranker(mock=mock) 27 | 28 | @opik.track(name="ContextRetriever.search") 29 | def search( 30 | self, 31 | query: str, 32 | k: int = 3, 33 | expand_to_n_queries: int = 3, 34 | ) -> list: 35 | query_model = Query.from_str(query) 36 | 37 | query_model = self._metadata_extractor.generate(query_model) 38 | logger.info( 39 | f"Successfully extracted the author_full_name = {query_model.author_full_name} from the query.", 40 | ) 41 | 42 | n_generated_queries = self._query_expander.generate(query_model, expand_to_n=expand_to_n_queries) 43 | logger.info( 44 | f"Successfully generated {len(n_generated_queries)} search queries.", 45 | ) 46 | 47 | with concurrent.futures.ThreadPoolExecutor() as executor: 48 | search_tasks = [executor.submit(self._search, _query_model, k) for _query_model in n_generated_queries] 49 | 50 | n_k_documents = [task.result() for task in concurrent.futures.as_completed(search_tasks)] 51 | n_k_documents = utils.misc.flatten(n_k_documents) 52 | n_k_documents = list(set(n_k_documents)) 53 | 54 | logger.info(f"{len(n_k_documents)} documents retrieved successfully") 55 | 56 | if len(n_k_documents) > 0: 57 | k_documents = self.rerank(query, chunks=n_k_documents, keep_top_k=k) 58 | else: 59 | k_documents = [] 60 | 61 | return k_documents 62 | 63 | def _search(self, query: Query, k: int = 3) -> list[EmbeddedChunk]: 64 | assert k >= 3, "k should be >= 3" 65 | 66 | def _search_data_category( 67 | data_category_odm: type[EmbeddedChunk], embedded_query: EmbeddedQuery 68 | ) -> list[EmbeddedChunk]: 69 | if embedded_query.author_id: 70 | query_filter = Filter( 71 | must=[ 72 | FieldCondition( 73 | key="author_id", 74 | match=MatchValue( 75 | value=str(embedded_query.author_id), 76 | ), 77 | ) 78 | ] 79 | ) 80 | else: 81 | query_filter = None 82 | 83 | return data_category_odm.search( 84 | query_vector=embedded_query.embedding, 85 | limit=k // 3, 86 | query_filter=query_filter, 87 | ) 88 | 89 | embedded_query: EmbeddedQuery = EmbeddingDispatcher.dispatch(query) 90 | 91 | post_chunks = _search_data_category(EmbeddedPostChunk, embedded_query) 92 | articles_chunks = _search_data_category(EmbeddedArticleChunk, embedded_query) 93 | repositories_chunks = _search_data_category(EmbeddedRepositoryChunk, embedded_query) 94 | 95 | retrieved_chunks = post_chunks + articles_chunks + repositories_chunks 96 | 97 | return retrieved_chunks 98 | 99 | def rerank(self, query: str | Query, chunks: list[EmbeddedChunk], keep_top_k: int) -> list[EmbeddedChunk]: 100 | if isinstance(query, str): 101 | query = Query.from_str(query) 102 | 103 | reranked_documents = self._reranker.generate(query=query, chunks=chunks, keep_top_k=keep_top_k) 104 | 105 | logger.info(f"{len(reranked_documents)} 
documents reranked successfully.") 106 | 107 | return reranked_documents 108 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/self_query.py: -------------------------------------------------------------------------------- 1 | import opik 2 | from langchain_openai import ChatOpenAI 3 | from loguru import logger 4 | 5 | from llm_engineering.application import utils 6 | from llm_engineering.domain.documents import UserDocument 7 | from llm_engineering.domain.queries import Query 8 | from llm_engineering.settings import settings 9 | 10 | from .base import RAGStep 11 | from .prompt_templates import SelfQueryTemplate 12 | 13 | 14 | class SelfQuery(RAGStep): 15 | @opik.track(name="SelfQuery.generate") 16 | def generate(self, query: Query) -> Query: 17 | if self._mock: 18 | return query 19 | 20 | prompt = SelfQueryTemplate().create_template() 21 | model = ChatOpenAI(model=settings.OPENAI_MODEL_ID, api_key=settings.OPENAI_API_KEY, temperature=0) 22 | 23 | chain = prompt | model 24 | 25 | response = chain.invoke({"question": query}) 26 | user_full_name = response.content.strip("\n ") 27 | 28 | if user_full_name == "none": 29 | return query 30 | 31 | first_name, last_name = utils.split_user_full_name(user_full_name) 32 | user = UserDocument.get_or_create(first_name=first_name, last_name=last_name) 33 | 34 | query.author_id = user.id 35 | query.author_full_name = user.full_name 36 | 37 | return query 38 | 39 | 40 | if __name__ == "__main__": 41 | query = Query.from_str("I am Paul Iusztin. Write an article about the best types of advanced RAG methods.") 42 | self_query = SelfQuery() 43 | query = self_query.generate(query) 44 | logger.info(f"Extracted author_id: {query.author_id}") 45 | logger.info(f"Extracted author_full_name: {query.author_full_name}") 46 | -------------------------------------------------------------------------------- /llm_engineering/application/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import misc 2 | from .split_user_full_name import split_user_full_name 3 | 4 | __all__ = ["misc", "split_user_full_name"] 5 | -------------------------------------------------------------------------------- /llm_engineering/application/utils/misc.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from llm_engineering.settings import settings 6 | 7 | 8 | def flatten(nested_list: list) -> list: 9 | """Flatten a list of lists into a single list.""" 10 | 11 | return [item for sublist in nested_list for item in sublist] 12 | 13 | 14 | def batch(list_: list, size: int) -> Generator[list, None, None]: 15 | yield from (list_[i : i + size] for i in range(0, len(list_), size)) 16 | 17 | 18 | def compute_num_tokens(text: str) -> int: 19 | tokenizer = AutoTokenizer.from_pretrained(settings.HF_MODEL_ID) 20 | 21 | return len(tokenizer.encode(text, add_special_tokens=False)) 22 | -------------------------------------------------------------------------------- /llm_engineering/application/utils/split_user_full_name.py: -------------------------------------------------------------------------------- 1 | from llm_engineering.domain.exceptions import ImproperlyConfigured 2 | 3 | 4 | def split_user_full_name(user: str | None) -> tuple[str, str]: 5 | if user is None: 6 | raise ImproperlyConfigured("User name is empty") 7 | 8 | name_tokens = user.split(" ") 9 | if len(name_tokens) == 0: 10 | raise ImproperlyConfigured("User name is empty") 11 | elif len(name_tokens) == 1: 12 | first_name, last_name = name_tokens[0], name_tokens[0] 13 | else: 14 | first_name, last_name = " ".join(name_tokens[:-1]), name_tokens[-1] 15 | 16 | return first_name, last_name 17 | -------------------------------------------------------------------------------- /llm_engineering/domain/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import base, chunks, cleaned_documents, dataset, documents, embedded_chunks, exceptions, inference, prompt, types 2 | 3 | __all__ = [ 4 | "base", 5 | "chunks", 6 | "cleaned_documents", 7 | "dataset", 8 | "documents", 9 | "embedded_chunks", 10 | "exceptions", 11 | "inference", 12 | "types", 13 | "prompt", 14 | ] 15 | -------------------------------------------------------------------------------- /llm_engineering/domain/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .nosql import NoSQLBaseDocument 2 | from .vector import VectorBaseDocument 3 | 4 | __all__ = ["NoSQLBaseDocument", "VectorBaseDocument"] 5 | -------------------------------------------------------------------------------- /llm_engineering/domain/base/nosql.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from abc import ABC 3 | from typing import Generic, Type, TypeVar 4 | 5 | from loguru import logger 6 | from pydantic import UUID4, BaseModel, Field 7 | from pymongo import errors 8 | 9 | from llm_engineering.domain.exceptions import ImproperlyConfigured 10 | from llm_engineering.infrastructure.db.mongo import connection 11 | from llm_engineering.settings import settings 12 | 13 | _database = connection.get_database(settings.DATABASE_NAME) 14 | 15 | 16 | T = TypeVar("T", bound="NoSQLBaseDocument") 17 | 18 | 19 | class NoSQLBaseDocument(BaseModel, Generic[T], ABC): 20 | id: UUID4 = Field(default_factory=uuid.uuid4) 21 | 22 | def __eq__(self, value: object) -> bool: 23 | if not isinstance(value, self.__class__): 24 | return False 25 | 26 | return self.id == value.id 27 | 28 | def __hash__(self) -> int: 29 | return hash(self.id) 30 | 31 | @classmethod 32 | def from_mongo(cls: Type[T], data: dict) -> T: 33 | """Convert "_id" (str object) into "id" (UUID object).""" 34 | 35 | if not data: 36 | raise ValueError("Data is empty.") 37 | 38 | id = data.pop("_id") 39 | 40 | return cls(**dict(data, id=id)) 41 | 42 | def to_mongo(self: T, **kwargs) -> dict: 43 | """Convert "id" (UUID object) into "_id" (str object).""" 44 | exclude_unset = kwargs.pop("exclude_unset", False) 45 | by_alias = kwargs.pop("by_alias", True) 46 | 47 | parsed = self.model_dump(exclude_unset=exclude_unset, by_alias=by_alias, **kwargs) 48 | 49 | if "_id" not in parsed and "id" in parsed: 50 | parsed["_id"] = str(parsed.pop("id")) 51 | 52 | for key, value in parsed.items(): 53 | if isinstance(value, uuid.UUID): 54 | parsed[key] = str(value) 55 | 56 | return parsed 57 | 58 | def model_dump(self: T, **kwargs) -> dict: 59 | dict_ = super().model_dump(**kwargs) 60 | 61 | for key, value in dict_.items(): 62 | if isinstance(value, uuid.UUID): 63 | dict_[key] = str(value) 64 | 65 | return dict_ 66 | 67 | def save(self: T, **kwargs) -> T | None: 68 | collection = _database[self.get_collection_name()] 69 | try: 70 | collection.insert_one(self.to_mongo(**kwargs)) 71 | 72 | return self 73 | except errors.WriteError: 74 | logger.exception("Failed to insert document.") 75 | 76 | return None 77 | 78 | @classmethod 79 | def get_or_create(cls: Type[T], **filter_options) -> T: 80 | collection = _database[cls.get_collection_name()] 81 | try: 82 | instance = collection.find_one(filter_options) 83 | if instance: 84 | return cls.from_mongo(instance) 85 | 86 | new_instance = cls(**filter_options) 87 | new_instance = new_instance.save() 88 | 89 | return new_instance 90 | except errors.OperationFailure: 91 | logger.exception(f"Failed to retrieve document with filter options: 
{filter_options}") 92 | 93 | raise 94 | 95 | @classmethod 96 | def bulk_insert(cls: Type[T], documents: list[T], **kwargs) -> bool: 97 | collection = _database[cls.get_collection_name()] 98 | try: 99 | collection.insert_many(doc.to_mongo(**kwargs) for doc in documents) 100 | 101 | return True 102 | except (errors.WriteError, errors.BulkWriteError): 103 | logger.error(f"Failed to insert documents of type {cls.__name__}") 104 | 105 | return False 106 | 107 | @classmethod 108 | def find(cls: Type[T], **filter_options) -> T | None: 109 | collection = _database[cls.get_collection_name()] 110 | try: 111 | instance = collection.find_one(filter_options) 112 | if instance: 113 | return cls.from_mongo(instance) 114 | 115 | return None 116 | except errors.OperationFailure: 117 | logger.error("Failed to retrieve document") 118 | 119 | return None 120 | 121 | @classmethod 122 | def bulk_find(cls: Type[T], **filter_options) -> list[T]: 123 | collection = _database[cls.get_collection_name()] 124 | try: 125 | instances = collection.find(filter_options) 126 | return [document for instance in instances if (document := cls.from_mongo(instance)) is not None] 127 | except errors.OperationFailure: 128 | logger.error("Failed to retrieve documents") 129 | 130 | return [] 131 | 132 | @classmethod 133 | def get_collection_name(cls: Type[T]) -> str: 134 | if not hasattr(cls, "Settings") or not hasattr(cls.Settings, "name"): 135 | raise ImproperlyConfigured( 136 | "Document should define a Settings configuration class with the name of the collection." 137 | ) 138 | 139 | return cls.Settings.name 140 | -------------------------------------------------------------------------------- /llm_engineering/domain/chunks.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Optional 3 | 4 | from pydantic import UUID4, Field 5 | 6 | from llm_engineering.domain.base import VectorBaseDocument 7 | from llm_engineering.domain.types import DataCategory 8 | 9 | 10 | class Chunk(VectorBaseDocument, ABC): 11 | content: str 12 | platform: str 13 | document_id: UUID4 14 | author_id: UUID4 15 | author_full_name: str 16 | metadata: dict = Field(default_factory=dict) 17 | 18 | 19 | class PostChunk(Chunk): 20 | image: Optional[str] = None 21 | 22 | class Config: 23 | category = DataCategory.POSTS 24 | 25 | 26 | class ArticleChunk(Chunk): 27 | link: str 28 | 29 | class Config: 30 | category = DataCategory.ARTICLES 31 | 32 | 33 | class RepositoryChunk(Chunk): 34 | name: str 35 | link: str 36 | 37 | class Config: 38 | category = DataCategory.REPOSITORIES 39 | -------------------------------------------------------------------------------- /llm_engineering/domain/cleaned_documents.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Optional 3 | 4 | from pydantic import UUID4 5 | 6 | from .base import VectorBaseDocument 7 | from .types import DataCategory 8 | 9 | 10 | class CleanedDocument(VectorBaseDocument, ABC): 11 | content: str 12 | platform: str 13 | author_id: UUID4 14 | author_full_name: str 15 | 16 | 17 | class CleanedPostDocument(CleanedDocument): 18 | image: Optional[str] = None 19 | 20 | class Config: 21 | name = "cleaned_posts" 22 | category = DataCategory.POSTS 23 | use_vector_index = False 24 | 25 | 26 | class CleanedArticleDocument(CleanedDocument): 27 | link: str 28 | 29 | class Config: 30 | name = "cleaned_articles" 31 | category = DataCategory.ARTICLES 32 |
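# Cleaned documents are persisted for lineage and dataset generation rather
# than similarity search, which is presumably why every cleaned collection
# sets use_vector_index = False; only the embedded_* collections defined
# later in this listing enable the vector index.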
use_vector_index = False 33 | 34 | 35 | class CleanedRepositoryDocument(CleanedDocument): 36 | name: str 37 | link: str 38 | 39 | class Config: 40 | name = "cleaned_repositories" 41 | category = DataCategory.REPOSITORIES 42 | use_vector_index = False 43 | -------------------------------------------------------------------------------- /llm_engineering/domain/dataset.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from loguru import logger 4 | 5 | try: 6 | from datasets import Dataset, DatasetDict, concatenate_datasets 7 | except ImportError: 8 | logger.warning("Huggingface datasets not installed. Install with `pip install datasets`") 9 | 10 | 11 | from llm_engineering.domain.base import VectorBaseDocument 12 | from llm_engineering.domain.types import DataCategory 13 | 14 | 15 | class DatasetType(Enum): 16 | INSTRUCTION = "instruction" 17 | PREFERENCE = "preference" 18 | 19 | 20 | class InstructDatasetSample(VectorBaseDocument): 21 | instruction: str 22 | answer: str 23 | 24 | class Config: 25 | category = DataCategory.INSTRUCT_DATASET_SAMPLES 26 | 27 | 28 | class PreferenceDatasetSample(VectorBaseDocument): 29 | instruction: str 30 | rejected: str 31 | chosen: str 32 | 33 | class Config: 34 | category = DataCategory.PREFERENCE_DATASET_SAMPLES 35 | 36 | 37 | class InstructDataset(VectorBaseDocument): 38 | category: DataCategory 39 | samples: list[InstructDatasetSample] 40 | 41 | class Config: 42 | category = DataCategory.INSTRUCT_DATASET 43 | 44 | @property 45 | def num_samples(self) -> int: 46 | return len(self.samples) 47 | 48 | def to_huggingface(self) -> "Dataset": 49 | data = [sample.model_dump() for sample in self.samples] 50 | 51 | return Dataset.from_dict( 52 | {"instruction": [d["instruction"] for d in data], "output": [d["answer"] for d in data]} 53 | ) 54 | 55 | 56 | class TrainTestSplit(VectorBaseDocument): 57 | train: dict 58 | test: dict 59 | test_split_size: float 60 | 61 | def to_huggingface(self, flatten: bool = False) -> "DatasetDict": 62 | train_datasets = {category.value: dataset.to_huggingface() for category, dataset in self.train.items()} 63 | test_datasets = {category.value: dataset.to_huggingface() for category, dataset in self.test.items()} 64 | 65 | if flatten: 66 | train_datasets = concatenate_datasets(list(train_datasets.values())) 67 | test_datasets = concatenate_datasets(list(test_datasets.values())) 68 | else: 69 | train_datasets = Dataset.from_dict(train_datasets) 70 | test_datasets = Dataset.from_dict(test_datasets) 71 | 72 | return DatasetDict({"train": train_datasets, "test": test_datasets}) 73 | 74 | 75 | class InstructTrainTestSplit(TrainTestSplit): 76 | train: dict[DataCategory, InstructDataset] 77 | test: dict[DataCategory, InstructDataset] 78 | test_split_size: float 79 | 80 | class Config: 81 | category = DataCategory.INSTRUCT_DATASET 82 | 83 | 84 | class PreferenceDataset(VectorBaseDocument): 85 | category: DataCategory 86 | samples: list[PreferenceDatasetSample] 87 | 88 | class Config: 89 | category = DataCategory.PREFERENCE_DATASET 90 | 91 | @property 92 | def num_samples(self) -> int: 93 | return len(self.samples) 94 | 95 | def to_huggingface(self) -> "Dataset": 96 | data = [sample.model_dump() for sample in self.samples] 97 | 98 | return Dataset.from_dict( 99 | { 100 | "prompt": [d["instruction"] for d in data], 101 | "rejected": [d["rejected"] for d in data], 102 | "chosen": [d["chosen"] for d in data], 103 | } 104 | ) 105 | 106 | 107 | class 
PreferenceTrainTestSplit(TrainTestSplit): 108 | train: dict[DataCategory, PreferenceDataset] 109 | test: dict[DataCategory, PreferenceDataset] 110 | test_split_size: float 111 | 112 | class Config: 113 | category = DataCategory.PREFERENCE_DATASET 114 | 115 | 116 | def build_dataset(dataset_type, *args, **kwargs) -> InstructDataset | PreferenceDataset: 117 | if dataset_type == DatasetType.INSTRUCTION: 118 | return InstructDataset(*args, **kwargs) 119 | elif dataset_type == DatasetType.PREFERENCE: 120 | return PreferenceDataset(*args, **kwargs) 121 | else: 122 | raise ValueError(f"Invalid dataset type: {dataset_type}") 123 | -------------------------------------------------------------------------------- /llm_engineering/domain/documents.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Optional 3 | 4 | from pydantic import UUID4, Field 5 | 6 | from .base import NoSQLBaseDocument 7 | from .types import DataCategory 8 | 9 | 10 | class UserDocument(NoSQLBaseDocument): 11 | first_name: str 12 | last_name: str 13 | 14 | class Settings: 15 | name = "users" 16 | 17 | @property 18 | def full_name(self): 19 | return f"{self.first_name} {self.last_name}" 20 | 21 | 22 | class Document(NoSQLBaseDocument, ABC): 23 | content: dict 24 | platform: str 25 | author_id: UUID4 = Field(alias="author_id") 26 | author_full_name: str = Field(alias="author_full_name") 27 | 28 | 29 | class RepositoryDocument(Document): 30 | name: str 31 | link: str 32 | 33 | class Settings: 34 | name = DataCategory.REPOSITORIES 35 | 36 | 37 | class PostDocument(Document): 38 | image: Optional[str] = None 39 | link: str | None = None 40 | 41 | class Settings: 42 | name = DataCategory.POSTS 43 | 44 | 45 | class ArticleDocument(Document): 46 | link: str 47 | 48 | class Settings: 49 | name = DataCategory.ARTICLES 50 | -------------------------------------------------------------------------------- /llm_engineering/domain/embedded_chunks.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | from pydantic import UUID4, Field 4 | 5 | from llm_engineering.domain.types import DataCategory 6 | 7 | from .base import VectorBaseDocument 8 | 9 | 10 | class EmbeddedChunk(VectorBaseDocument, ABC): 11 | content: str 12 | embedding: list[float] | None 13 | platform: str 14 | document_id: UUID4 15 | author_id: UUID4 16 | author_full_name: str 17 | metadata: dict = Field(default_factory=dict) 18 | 19 | @classmethod 20 | def to_context(cls, chunks: list["EmbeddedChunk"]) -> str: 21 | context = "" 22 | for i, chunk in enumerate(chunks): 23 | context += f""" 24 | Chunk {i + 1}: 25 | Type: {chunk.__class__.__name__} 26 | Platform: {chunk.platform} 27 | Author: {chunk.author_full_name} 28 | Content: {chunk.content}\n 29 | """ 30 | 31 | return context 32 | 33 | 34 | class EmbeddedPostChunk(EmbeddedChunk): 35 | class Config: 36 | name = "embedded_posts" 37 | category = DataCategory.POSTS 38 | use_vector_index = True 39 | 40 | 41 | class EmbeddedArticleChunk(EmbeddedChunk): 42 | link: str 43 | 44 | class Config: 45 | name = "embedded_articles" 46 | category = DataCategory.ARTICLES 47 | use_vector_index = True 48 | 49 | 50 | class EmbeddedRepositoryChunk(EmbeddedChunk): 51 | name: str 52 | link: str 53 | 54 | class Config: 55 | name = "embedded_repositories" 56 | category = DataCategory.REPOSITORIES 57 | use_vector_index = True 58 | -------------------------------------------------------------------------------- 
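`EmbeddedChunk.to_context` above is the glue between retrieval and prompting: it flattens a list of retrieved chunks into a single string that gets injected into the LLM prompt (see `inference_pipeline_api.py` later in this listing). A minimal usage sketch follows; the field values are illustrative, and it assumes the `VectorBaseDocument` base supplies any remaining defaults such as `id`:

from uuid import uuid4

from llm_engineering.domain.embedded_chunks import EmbeddedArticleChunk, EmbeddedChunk

# Build one illustrative chunk; in the real pipeline these come back from Qdrant.
chunks = [
    EmbeddedArticleChunk(
        content="RAG pairs a retriever with a generator ...",  # illustrative text
        embedding=None,  # the embedding itself is not needed to render the context string
        platform="medium",
        document_id=uuid4(),
        author_id=uuid4(),
        author_full_name="Jane Doe",  # hypothetical author
        link="https://example.com/rag-article",  # hypothetical link
    )
]

context = EmbeddedChunk.to_context(chunks)
# -> "Chunk 1: / Type: EmbeddedArticleChunk / Platform: medium / Author: Jane Doe / Content: ..."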
/llm_engineering/domain/exceptions.py: -------------------------------------------------------------------------------- 1 | class LLMTwinException(Exception): 2 | pass 3 | 4 | 5 | class ImproperlyConfigured(LLMTwinException): 6 | pass 7 | -------------------------------------------------------------------------------- /llm_engineering/domain/inference.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class DeploymentStrategy(ABC): 5 | @abstractmethod 6 | def deploy(self, model, endpoint_name: str, endpoint_config_name: str) -> None: 7 | pass 8 | 9 | 10 | class Inference(ABC): 11 | """An abstract class for performing inference.""" 12 | 13 | def __init__(self): 14 | self.model = None 15 | 16 | @abstractmethod 17 | def set_payload(self, inputs, parameters=None): 18 | pass 19 | 20 | @abstractmethod 21 | def inference(self): 22 | pass 23 | -------------------------------------------------------------------------------- /llm_engineering/domain/prompt.py: -------------------------------------------------------------------------------- 1 | from llm_engineering.domain.base import VectorBaseDocument 2 | from llm_engineering.domain.cleaned_documents import CleanedDocument 3 | from llm_engineering.domain.types import DataCategory 4 | 5 | 6 | class Prompt(VectorBaseDocument): 7 | template: str 8 | input_variables: dict 9 | content: str 10 | num_tokens: int | None = None 11 | 12 | class Config: 13 | category = DataCategory.PROMPT 14 | 15 | 16 | class GenerateDatasetSamplesPrompt(Prompt): 17 | data_category: DataCategory 18 | document: CleanedDocument 19 | -------------------------------------------------------------------------------- /llm_engineering/domain/queries.py: -------------------------------------------------------------------------------- 1 | from pydantic import UUID4, Field 2 | 3 | from llm_engineering.domain.base import VectorBaseDocument 4 | from llm_engineering.domain.types import DataCategory 5 | 6 | 7 | class Query(VectorBaseDocument): 8 | content: str 9 | author_id: UUID4 | None = None 10 | author_full_name: str | None = None 11 | metadata: dict = Field(default_factory=dict) 12 | 13 | class Config: 14 | category = DataCategory.QUERIES 15 | 16 | @classmethod 17 | def from_str(cls, query: str) -> "Query": 18 | return Query(content=query.strip("\n ")) 19 | 20 | def replace_content(self, new_content: str) -> "Query": 21 | return Query( 22 | id=self.id, 23 | content=new_content, 24 | author_id=self.author_id, 25 | author_full_name=self.author_full_name, 26 | metadata=self.metadata, 27 | ) 28 | 29 | 30 | class EmbeddedQuery(Query): 31 | embedding: list[float] 32 | 33 | class Config: 34 | category = DataCategory.QUERIES 35 | -------------------------------------------------------------------------------- /llm_engineering/domain/types.py: -------------------------------------------------------------------------------- 1 | from enum import StrEnum 2 | 3 | 4 | class DataCategory(StrEnum): 5 | PROMPT = "prompt" 6 | QUERIES = "queries" 7 | 8 | INSTRUCT_DATASET_SAMPLES = "instruct_dataset_samples" 9 | INSTRUCT_DATASET = "instruct_dataset" 10 | PREFERENCE_DATASET_SAMPLES = "preference_dataset_samples" 11 | PREFERENCE_DATASET = "preference_dataset" 12 | 13 | POSTS = "posts" 14 | ARTICLES = "articles" 15 | REPOSITORIES = "repositories" 16 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/infrastructure/__init__.py -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/infrastructure/aws/__init__.py -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/infrastructure/aws/deploy/__init__.py -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/delete_sagemaker_endpoint.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | try: 4 | import boto3 5 | from botocore.exceptions import ClientError 6 | except ModuleNotFoundError: 7 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 8 | 9 | 10 | from llm_engineering.settings import settings 11 | 12 | 13 | def delete_endpoint_and_config(endpoint_name) -> None: 14 | """ 15 | Deletes an AWS SageMaker endpoint and its associated configuration. 16 | Args: 17 | endpoint_name (str): The name of the SageMaker endpoint to delete. 18 | Returns: 19 | None 20 | """ 21 | 22 | try: 23 | sagemaker_client = boto3.client( 24 | "sagemaker", 25 | region_name=settings.AWS_REGION, 26 | aws_access_key_id=settings.AWS_ACCESS_KEY, 27 | aws_secret_access_key=settings.AWS_SECRET_KEY, 28 | ) 29 | except Exception: 30 | logger.exception("Error creating SageMaker client.") 31 | 32 | return 33 | 34 | # Get the endpoint configuration name 35 | try: 36 | response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name) 37 | config_name = response["EndpointConfigName"] 38 | except ClientError: 39 | logger.error("Error getting endpoint configuration and model name.") 40 | 41 | return 42 | 43 | # Delete the endpoint 44 | try: 45 | sagemaker_client.delete_endpoint(EndpointName=endpoint_name) 46 | logger.info(f"Endpoint '{endpoint_name}' deletion initiated.") 47 | except ClientError: 48 | logger.error("Error deleting endpoint.") 49 | 50 | try: 51 | response = sagemaker_client.describe_endpoint_config(EndpointConfigName=config_name)  # describe the configuration fetched above, not the endpoint name 52 | model_name = response["ProductionVariants"][0]["ModelName"] 53 | except ClientError: 54 | logger.error("Error getting model name.") 55 | 56 | # Delete the endpoint configuration 57 | try: 58 | sagemaker_client.delete_endpoint_config(EndpointConfigName=config_name) 59 | logger.info(f"Endpoint configuration '{config_name}' deleted.") 60 | except ClientError: 61 | logger.error("Error deleting endpoint configuration.") 62 | 63 | # Delete models 64 | try: 65 | sagemaker_client.delete_model(ModelName=model_name) 66 | logger.info(f"Model '{model_name}' deleted.") 67 | except ClientError: 68 | logger.error("Error deleting model.") 69 | 70 | 71 | if __name__ == "__main__": 72 | endpoint_name = settings.SAGEMAKER_ENDPOINT_INFERENCE 73 | logger.info(f"Attempting to delete
endpoint: {endpoint_name}") 74 | delete_endpoint_and_config(endpoint_name=endpoint_name) 75 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/huggingface/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/infrastructure/aws/deploy/huggingface/__init__.py -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/huggingface/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from loguru import logger 4 | 5 | try: 6 | from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements 7 | except ModuleNotFoundError: 8 | logger.warning("Couldn't load SageMaker imports. Run 'poetry install --with aws' to support AWS.") 9 | 10 | from llm_engineering.settings import settings 11 | 12 | hugging_face_deploy_config = { 13 | "HF_MODEL_ID": settings.HF_MODEL_ID, 14 | "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN, 15 | "SM_NUM_GPUS": json.dumps(settings.SM_NUM_GPUS), # Number of GPU used per replica 16 | "MAX_INPUT_LENGTH": json.dumps(settings.MAX_INPUT_LENGTH), # Max length of input text 17 | "MAX_TOTAL_TOKENS": json.dumps(settings.MAX_TOTAL_TOKENS), # Max length of the generation (including input text) 18 | "MAX_BATCH_TOTAL_TOKENS": json.dumps(settings.MAX_BATCH_TOTAL_TOKENS), 19 | "MAX_BATCH_PREFILL_TOKENS": json.dumps(settings.MAX_BATCH_TOTAL_TOKENS), 20 | "HF_MODEL_QUANTIZE": "bitsandbytes", 21 | } 22 | 23 | 24 | model_resource_config = ResourceRequirements( 25 | requests={ 26 | "copies": settings.COPIES, # Number of replicas. 27 | "num_accelerators": settings.GPUS, # Number of GPUs required. 28 | "num_cpus": settings.CPUS, # Number of CPU cores required. 29 | "memory": 5 * 1024, # Minimum memory required in Mb (required) 30 | }, 31 | ) 32 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/huggingface/run.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | try: 4 | from sagemaker.enums import EndpointType 5 | from sagemaker.huggingface import get_huggingface_llm_image_uri 6 | except ModuleNotFoundError: 7 | logger.warning("Couldn't load SageMaker imports. Run 'poetry install --with aws' to support AWS.") 8 | 9 | from llm_engineering.model.utils import ResourceManager 10 | from llm_engineering.settings import settings 11 | 12 | from .config import hugging_face_deploy_config, model_resource_config 13 | from .sagemaker_huggingface import DeploymentService, SagemakerHuggingfaceStrategy 14 | 15 | 16 | def create_endpoint(endpoint_type=EndpointType.INFERENCE_COMPONENT_BASED) -> None: 17 | assert settings.AWS_ARN_ROLE is not None, "AWS_ARN_ROLE is not set in the .env file." 
18 | 19 | logger.info(f"Creating endpoint with endpoint_type = {endpoint_type} and model_id = {settings.HF_MODEL_ID}") 20 | 21 | llm_image = get_huggingface_llm_image_uri("huggingface", version="2.2.0") 22 | 23 | resource_manager = ResourceManager() 24 | deployment_service = DeploymentService(resource_manager=resource_manager) 25 | 26 | SagemakerHuggingfaceStrategy(deployment_service).deploy( 27 | role_arn=settings.AWS_ARN_ROLE, 28 | llm_image=llm_image, 29 | config=hugging_face_deploy_config, 30 | endpoint_name=settings.SAGEMAKER_ENDPOINT_INFERENCE, 31 | endpoint_config_name=settings.SAGEMAKER_ENDPOINT_CONFIG_INFERENCE, 32 | gpu_instance_type=settings.GPU_INSTANCE_TYPE, 33 | resources=model_resource_config, 34 | endpoint_type=endpoint_type, 35 | ) 36 | 37 | 38 | if __name__ == "__main__": 39 | create_endpoint(endpoint_type=EndpointType.MODEL_BASED) 40 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/huggingface/sagemaker_huggingface.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from typing import Optional 3 | 4 | from loguru import logger 5 | 6 | try: 7 | import boto3 8 | from sagemaker.enums import EndpointType 9 | from sagemaker.huggingface import HuggingFaceModel 10 | except ModuleNotFoundError: 11 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 12 | 13 | from llm_engineering.domain.inference import DeploymentStrategy 14 | from llm_engineering.settings import settings 15 | 16 | 17 | class SagemakerHuggingfaceStrategy(DeploymentStrategy): 18 | def __init__(self, deployment_service) -> None: 19 | """ 20 | Initializes the deployment strategy with the necessary services. 21 | 22 | :param deployment_service: The service handling the deployment details. 23 | :param logger: Logger for logging information and errors. 24 | """ 25 | self.deployment_service = deployment_service 26 | 27 | def deploy( 28 | self, 29 | role_arn: str, 30 | llm_image: str, 31 | config: dict, 32 | endpoint_name: str, 33 | endpoint_config_name: str, 34 | gpu_instance_type: str, 35 | resources: Optional[dict] = None, 36 | endpoint_type: enum.Enum = EndpointType.MODEL_BASED, 37 | ) -> None: 38 | """ 39 | Initiates the deployment process for a HuggingFace model on AWS SageMaker. 40 | 41 | :param role_arn: AWS role ARN with permissions for SageMaker deployment. 42 | :param llm_image: URI for the HuggingFace model Docker image. 43 | :param config: Configuration settings for the model environment. 44 | :param endpoint_name: Name of the SageMaker endpoint. 45 | :param endpoint_config_name: Name of the SageMaker endpoint configuration. 
46 | :param resources: Optional resources for the model deployment (used for multi-model endpoints) 47 | :param endpoint_type: can be EndpointType.MODEL_BASED (without inference component) 48 | or EndpointType.INFERENCE_COMPONENT_BASED (with inference component) 49 | 50 | """ 51 | 52 | logger.info("Starting deployment using Sagemaker Huggingface Strategy...") 53 | logger.info( 54 | f"Deployment parameters: number of replicas: {settings.COPIES}, number of GPUs: {settings.GPUS}, instance type: {settings.GPU_INSTANCE_TYPE}" 55 | ) 56 | try: 57 | # Delegate to the deployment service to handle the actual deployment details 58 | self.deployment_service.deploy( 59 | role_arn=role_arn, 60 | llm_image=llm_image, 61 | config=config, 62 | endpoint_name=endpoint_name, 63 | endpoint_config_name=endpoint_config_name, 64 | gpu_instance_type=gpu_instance_type, 65 | resources=resources, 66 | endpoint_type=endpoint_type, 67 | ) 68 | logger.info("Deployment completed successfully.") 69 | except Exception as e: 70 | logger.error(f"Error during deployment: {e}") 71 | raise 72 | 73 | 74 | class DeploymentService: 75 | def __init__(self, resource_manager): 76 | """ 77 | Initializes the DeploymentService with necessary dependencies. 78 | 79 | :param resource_manager: Manages resources and configurations for deployments. 80 | Configuration is read from the module-level settings object, and logging 81 | goes through the module-level loguru logger. 82 | """ 83 | 84 | self.sagemaker_client = boto3.client( 85 | "sagemaker", 86 | region_name=settings.AWS_REGION, 87 | aws_access_key_id=settings.AWS_ACCESS_KEY, 88 | aws_secret_access_key=settings.AWS_SECRET_KEY, 89 | ) 90 | self.resource_manager = resource_manager 91 | 92 | def deploy( 93 | self, 94 | role_arn: str, 95 | llm_image: str, 96 | config: dict, 97 | endpoint_name: str, 98 | endpoint_config_name: str, 99 | gpu_instance_type: str, 100 | resources: Optional[dict] = None, 101 | endpoint_type: enum.Enum = EndpointType.MODEL_BASED, 102 | ) -> None: 103 | """ 104 | Handles the deployment of a model to SageMaker, including checking and creating 105 | configurations and endpoints as necessary. 106 | 107 | :param role_arn: The ARN of the IAM role for SageMaker to access resources. 108 | :param llm_image: URI of the Docker image in ECR for the HuggingFace model. 109 | :param config: Configuration dictionary for the environment variables of the model. 110 | :param endpoint_name: The name for the SageMaker endpoint. 111 | :param endpoint_config_name: The name for the SageMaker endpoint configuration. 112 | :param resources: Optional resources for the model deployment (used for multi-model endpoints) 113 | :param endpoint_type: can be EndpointType.MODEL_BASED (without inference component) 114 | or EndpointType.INFERENCE_COMPONENT_BASED (with inference component) 115 | :param gpu_instance_type: The instance type for the SageMaker endpoint. 116 | """ 117 | 118 | try: 119 | # Check if the endpoint configuration exists 120 | if self.resource_manager.endpoint_config_exists(endpoint_config_name=endpoint_config_name): 121 | logger.info(f"Endpoint configuration {endpoint_config_name} exists. Using existing configuration...") 122 | else: 123 | logger.info(f"Endpoint configuration {endpoint_config_name} does not exist.") 124 | 125 | # Prepare and deploy the HuggingFace model 126 | self.prepare_and_deploy_model( 127 | role_arn=role_arn, 128 | llm_image=llm_image, 129 | config=config, 130 | endpoint_name=endpoint_name, 131 | update_endpoint=False, 132 | resources=resources, 133 | endpoint_type=endpoint_type, 134 | gpu_instance_type=gpu_instance_type, 135 | ) 136 | 137 | logger.info(f"Successfully deployed/updated model to endpoint {endpoint_name}.") 138 | except Exception as e: 139 | logger.error(f"Failed to deploy model to SageMaker: {e}") 140 | 141 | raise 142 | 143 | @staticmethod 144 | def prepare_and_deploy_model( 145 | role_arn: str, 146 | llm_image: str, 147 | config: dict, 148 | endpoint_name: str, 149 | update_endpoint: bool, 150 | gpu_instance_type: str, 151 | resources: Optional[dict] = None, 152 | endpoint_type: enum.Enum = EndpointType.MODEL_BASED, 153 | ) -> None: 154 | """ 155 | Prepares and deploys/updates the HuggingFace model on SageMaker. 156 | 157 | :param role_arn: The ARN of the IAM role. 158 | :param llm_image: The Docker image URI for the HuggingFace model. 159 | :param config: Configuration settings for the model. 160 | :param endpoint_name: The name of the endpoint. 161 | :param update_endpoint: Boolean flag to update an existing endpoint. 162 | :param gpu_instance_type: The instance type for the SageMaker endpoint. 163 | :param resources: Optional resources for the model deployment (used for multi-model endpoints) 164 | :param endpoint_type: can be EndpointType.MODEL_BASED (without inference component) 165 | or EndpointType.INFERENCE_COMPONENT_BASED (with inference component) 166 | """ 167 | 168 | huggingface_model = HuggingFaceModel( 169 | role=role_arn, 170 | image_uri=llm_image, 171 | env=config, 172 | ) 173 | 174 | # Deploy or update the model based on the endpoint existence 175 | huggingface_model.deploy( 176 | instance_type=gpu_instance_type, 177 | initial_instance_count=1, 178 | endpoint_name=endpoint_name, 179 | update_endpoint=update_endpoint, 180 | resources=resources, 181 | tags=[{"Key": "task", "Value": "model_task"}], 182 | endpoint_type=endpoint_type, 183 | container_startup_health_check_timeout=900, 184 | ) 185 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/roles/create_execution_role.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | from loguru import logger 5 | 6 | try: 7 | import boto3 8 | except ModuleNotFoundError: 9 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 10 | 11 | from llm_engineering.settings import settings 12 | 13 | 14 | def create_sagemaker_execution_role(role_name: str): 15 | assert settings.AWS_REGION, "AWS_REGION is not set." 16 | assert settings.AWS_ACCESS_KEY, "AWS_ACCESS_KEY is not set." 17 | assert settings.AWS_SECRET_KEY, "AWS_SECRET_KEY is not set."
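# The asserts above fail fast before any AWS call is made. A hypothetical
# alternative is to drop the explicit keys below and let boto3 fall back to its
# default credential chain (environment variables, ~/.aws/credentials, or an
# instance profile); the explicit settings-based wiring keeps the script
# self-contained.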
18 | 19 | # Create IAM client 20 | iam = boto3.client( 21 | "iam", 22 | region_name=settings.AWS_REGION, 23 | aws_access_key_id=settings.AWS_ACCESS_KEY, 24 | aws_secret_access_key=settings.AWS_SECRET_KEY, 25 | ) 26 | 27 | # Define the trust relationship policy 28 | trust_relationship = { 29 | "Version": "2012-10-17", 30 | "Statement": [ 31 | {"Effect": "Allow", "Principal": {"Service": "sagemaker.amazonaws.com"}, "Action": "sts:AssumeRole"} 32 | ], 33 | } 34 | 35 | try: 36 | # Create the IAM role 37 | role = iam.create_role( 38 | RoleName=role_name, 39 | AssumeRolePolicyDocument=json.dumps(trust_relationship), 40 | Description="Execution role for SageMaker", 41 | ) 42 | 43 | # Attach necessary policies 44 | policies = [ 45 | "arn:aws:iam::aws:policy/AmazonSageMakerFullAccess", 46 | "arn:aws:iam::aws:policy/AmazonS3FullAccess", 47 | "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess", 48 | "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess", 49 | ] 50 | 51 | for policy in policies: 52 | iam.attach_role_policy(RoleName=role_name, PolicyArn=policy) 53 | 54 | logger.info(f"Role '{role_name}' created successfully.") 55 | logger.info(f"Role ARN: {role['Role']['Arn']}") 56 | 57 | return role["Role"]["Arn"] 58 | 59 | except iam.exceptions.EntityAlreadyExistsException: 60 | logger.warning(f"Role '{role_name}' already exists. Fetching its ARN...") 61 | role = iam.get_role(RoleName=role_name) 62 | 63 | return role["Role"]["Arn"] 64 | 65 | 66 | if __name__ == "__main__": 67 | role_arn = create_sagemaker_execution_role("SageMakerExecutionRoleLLM") 68 | logger.info(role_arn) 69 | 70 | # Save the role ARN to a file 71 | with Path("sagemaker_execution_role.json").open("w") as f: 72 | json.dump({"RoleArn": role_arn}, f) 73 | 74 | logger.info("Role ARN saved to 'sagemaker_execution_role.json'") 75 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/roles/create_sagemaker_role.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | from loguru import logger 5 | 6 | try: 7 | import boto3 8 | except ModuleNotFoundError: 9 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 10 | 11 | from llm_engineering.settings import settings 12 | 13 | 14 | def create_sagemaker_user(username: str): 15 | assert settings.AWS_REGION, "AWS_REGION is not set." 16 | assert settings.AWS_ACCESS_KEY, "AWS_ACCESS_KEY is not set." 17 | assert settings.AWS_SECRET_KEY, "AWS_SECRET_KEY is not set." 
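# Note: create_access_key (called below) is the only time AWS returns the
# secret access key in plaintext, and the __main__ block persists it to
# 'sagemaker_user_credentials.json', so treat that file as a secret.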
18 | 19 | # Create IAM client 20 | iam = boto3.client( 21 | "iam", 22 | region_name=settings.AWS_REGION, 23 | aws_access_key_id=settings.AWS_ACCESS_KEY, 24 | aws_secret_access_key=settings.AWS_SECRET_KEY, 25 | ) 26 | 27 | # Create user 28 | iam.create_user(UserName=username) 29 | 30 | # Attach necessary policies 31 | policies = [ 32 | "arn:aws:iam::aws:policy/AmazonSageMakerFullAccess", 33 | "arn:aws:iam::aws:policy/AWSCloudFormationFullAccess", 34 | "arn:aws:iam::aws:policy/IAMFullAccess", 35 | "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess", 36 | "arn:aws:iam::aws:policy/AmazonS3FullAccess", 37 | ] 38 | 39 | for policy in policies: 40 | iam.attach_user_policy(UserName=username, PolicyArn=policy) 41 | 42 | # Create access key 43 | response = iam.create_access_key(UserName=username) 44 | access_key = response["AccessKey"] 45 | 46 | logger.info(f"User '{username}' successfully created.") 47 | logger.info("Access Key ID and Secret Access Key successfully created.") 48 | 49 | return {"AccessKeyId": access_key["AccessKeyId"], "SecretAccessKey": access_key["SecretAccessKey"]} 50 | 51 | 52 | if __name__ == "__main__": 53 | new_user = create_sagemaker_user("sagemaker-deployer") 54 | 55 | with Path("sagemaker_user_credentials.json").open("w") as f: 56 | json.dump(new_user, f) 57 | 58 | logger.info("Credentials saved to 'sagemaker_user_credentials.json'") 59 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/db/mongo.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from pymongo import MongoClient 3 | from pymongo.errors import ConnectionFailure 4 | 5 | from llm_engineering.settings import settings 6 | 7 | 8 | class MongoDatabaseConnector: 9 | _instance: MongoClient | None = None 10 | 11 | def __new__(cls, *args, **kwargs) -> MongoClient: 12 | if cls._instance is None: 13 | try: 14 | cls._instance = MongoClient(settings.DATABASE_HOST) 15 | except ConnectionFailure as e: 16 | logger.error(f"Couldn't connect to the database: {e!s}") 17 | 18 | raise 19 | 20 | logger.info(f"Connection to MongoDB with URI successful: {settings.DATABASE_HOST}") 21 | 22 | return cls._instance 23 | 24 | 25 | connection = MongoDatabaseConnector() 26 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/db/qdrant.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from qdrant_client import QdrantClient 3 | from qdrant_client.http.exceptions import UnexpectedResponse 4 | 5 | from llm_engineering.settings import settings 6 | 7 | 8 | class QdrantDatabaseConnector: 9 | _instance: QdrantClient | None = None 10 | 11 | def __new__(cls, *args, **kwargs) -> QdrantClient: 12 | if cls._instance is None: 13 | try: 14 | if settings.USE_QDRANT_CLOUD: 15 | cls._instance = QdrantClient( 16 | url=settings.QDRANT_CLOUD_URL, 17 | api_key=settings.QDRANT_APIKEY, 18 | ) 19 | 20 | uri = settings.QDRANT_CLOUD_URL 21 | else: 22 | cls._instance = QdrantClient( 23 | host=settings.QDRANT_DATABASE_HOST, 24 | port=settings.QDRANT_DATABASE_PORT, 25 | ) 26 | 27 | uri = f"{settings.QDRANT_DATABASE_HOST}:{settings.QDRANT_DATABASE_PORT}" 28 | 29 | logger.info(f"Connection to Qdrant DB with URI successful: {uri}") 30 | except UnexpectedResponse: 31 | logger.exception( 32 | "Couldn't connect to Qdrant.", 33 | host=settings.QDRANT_DATABASE_HOST, 34 | port=settings.QDRANT_DATABASE_PORT, 35 | 
url=settings.QDRANT_CLOUD_URL, 36 | ) 37 | 38 | raise 39 | 40 | return cls._instance 41 | 42 | 43 | connection = QdrantDatabaseConnector() 44 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/files_io.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | 5 | class JsonFileManager: 6 | @classmethod 7 | def read(cls, filename: str | Path) -> list: 8 | file_path: Path = Path(filename) 9 | 10 | try: 11 | with file_path.open("r") as file: 12 | return json.load(file) 13 | except FileNotFoundError: 14 | raise FileNotFoundError(f"File '{file_path=}' does not exist.") from None 15 | except json.JSONDecodeError as e: 16 | raise json.JSONDecodeError( 17 | msg=f"File '{file_path=}' is not properly formatted as JSON.", 18 | doc=e.doc, 19 | pos=e.pos, 20 | ) from None 21 | 22 | @classmethod 23 | def write(cls, filename: str | Path, data: list | dict) -> Path: 24 | file_path: Path = Path(filename) 25 | file_path = file_path.resolve().absolute() 26 | file_path.parent.mkdir(parents=True, exist_ok=True) 27 | 28 | with file_path.open("w") as file: 29 | json.dump(data, file, indent=4) 30 | 31 | return file_path 32 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/inference_pipeline_api.py: -------------------------------------------------------------------------------- 1 | import opik 2 | from fastapi import FastAPI, HTTPException 3 | from opik import opik_context 4 | from pydantic import BaseModel 5 | 6 | from llm_engineering import settings 7 | from llm_engineering.application.rag.retriever import ContextRetriever 8 | from llm_engineering.application.utils import misc 9 | from llm_engineering.domain.embedded_chunks import EmbeddedChunk 10 | from llm_engineering.infrastructure.opik_utils import configure_opik 11 | from llm_engineering.model.inference import InferenceExecutor, LLMInferenceSagemakerEndpoint 12 | 13 | configure_opik() 14 | 15 | app = FastAPI() 16 | 17 | 18 | class QueryRequest(BaseModel): 19 | query: str 20 | 21 | 22 | class QueryResponse(BaseModel): 23 | answer: str 24 | 25 | 26 | @opik.track 27 | def call_llm_service(query: str, context: str | None) -> str: 28 | llm = LLMInferenceSagemakerEndpoint( 29 | endpoint_name=settings.SAGEMAKER_ENDPOINT_INFERENCE, inference_component_name=None 30 | ) 31 | answer = InferenceExecutor(llm, query, context).execute() 32 | 33 | return answer 34 | 35 | 36 | @opik.track 37 | def rag(query: str) -> str: 38 | retriever = ContextRetriever(mock=False) 39 | documents = retriever.search(query, k=3) 40 | context = EmbeddedChunk.to_context(documents) 41 | 42 | answer = call_llm_service(query, context) 43 | 44 | opik_context.update_current_trace( 45 | tags=["rag"], 46 | metadata={ 47 | "model_id": settings.HF_MODEL_ID, 48 | "embedding_model_id": settings.TEXT_EMBEDDING_MODEL_ID, 49 | "temperature": settings.TEMPERATURE_INFERENCE, 50 | "query_tokens": misc.compute_num_tokens(query), 51 | "context_tokens": misc.compute_num_tokens(context), 52 | "answer_tokens": misc.compute_num_tokens(answer), 53 | }, 54 | ) 55 | 56 | return answer 57 | 58 | 59 | @app.post("/rag", response_model=QueryResponse) 60 | async def rag_endpoint(request: QueryRequest): 61 | try: 62 | answer = rag(query=request.query) 63 | 64 | return {"answer": answer} 65 | except Exception as e: 66 | raise HTTPException(status_code=500, detail=str(e)) from e 67 | 
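With the route above in place, the service can be exercised end to end. A minimal client sketch, assuming the app is served locally on port 8000 (as the `run-inference-ml-service` task in pyproject.toml does via `tools/ml_service.py`) and that a SageMaker inference endpoint is live behind it:

import requests  # any HTTP client works; requests is used here for brevity

response = requests.post(
    "http://127.0.0.1:8000/rag",
    json={"query": "Write a short post about vector databases."},
    timeout=120,  # retrieval plus LLM generation can take a while
)
response.raise_for_status()
print(response.json()["answer"])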
-------------------------------------------------------------------------------- /llm_engineering/infrastructure/opik_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import opik 4 | from loguru import logger 5 | from opik.configurator.configure import OpikConfigurator 6 | 7 | from llm_engineering import settings 8 | 9 | 10 | def configure_opik() -> None: 11 | if settings.COMET_API_KEY and settings.COMET_PROJECT: 12 | try: 13 | client = OpikConfigurator(api_key=settings.COMET_API_KEY) 14 | default_workspace = client._get_default_workspace() 15 | except Exception: 16 | logger.warning("Default workspace not found. Setting workspace to None and enabling interactive mode.") 17 | default_workspace = None 18 | 19 | os.environ["OPIK_PROJECT_NAME"] = settings.COMET_PROJECT 20 | 21 | opik.configure(api_key=settings.COMET_API_KEY, workspace=default_workspace, use_local=False, force=True) 22 | logger.info("Opik configured successfully.") 23 | else: 24 | logger.warning( 25 | "COMET_API_KEY and COMET_PROJECT are not set. Set them to enable prompt monitoring with Opik (powered by Comet ML)." 26 | ) 27 | -------------------------------------------------------------------------------- /llm_engineering/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/model/__init__.py -------------------------------------------------------------------------------- /llm_engineering/model/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/model/evaluation/__init__.py -------------------------------------------------------------------------------- /llm_engineering/model/evaluation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.43.3 2 | datasets==2.20.0 3 | vllm==0.6.1.post2 4 | tqdm==4.66.4 5 | openai==1.55.3 -------------------------------------------------------------------------------- /llm_engineering/model/evaluation/sagemaker.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import HfApi 4 | from loguru import logger 5 | 6 | try: 7 | from sagemaker.huggingface import HuggingFaceProcessor 8 | except ModuleNotFoundError: 9 | logger.warning("Couldn't load SageMaker imports. Run 'poetry install --with aws' to support AWS.") 10 | 11 | from llm_engineering import settings 12 | 13 | evaluation_dir = Path(__file__).resolve().parent 14 | evaluation_requirements_path = evaluation_dir / "requirements.txt" 15 | 16 | 17 | def run_evaluation_on_sagemaker(is_dummy: bool = True) -> None: 18 | assert settings.HUGGINGFACE_ACCESS_TOKEN, "Hugging Face access token is required." 19 | assert settings.OPENAI_API_KEY, "OpenAI API key is required." 20 | assert settings.AWS_ARN_ROLE, "AWS ARN role is required." 
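# The three asserts above guard a remote (billable) SageMaker job: the Hugging
# Face token and OpenAI key are forwarded into the processing container through
# the env dict below, and the ARN role is the IAM role the job runs under.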
21 | 22 | if not evaluation_dir.exists(): 23 | raise FileNotFoundError(f"The directory {evaluation_dir} does not exist.") 24 | if not evaluation_requirements_path.exists(): 25 | raise FileNotFoundError(f"The file {evaluation_requirements_path} does not exist.") 26 | 27 | api = HfApi() 28 | user_info = api.whoami(token=settings.HUGGINGFACE_ACCESS_TOKEN) 29 | huggingface_user = user_info["name"] 30 | logger.info(f"Current Hugging Face user: {huggingface_user}") 31 | 32 | env = { 33 | "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN, 34 | "OPENAI_API_KEY": settings.OPENAI_API_KEY, 35 | "DATASET_HUGGINGFACE_WORKSPACE": huggingface_user, 36 | "MODEL_HUGGINGFACE_WORKSPACE": huggingface_user, 37 | } 38 | if is_dummy: 39 | env["IS_DUMMY"] = "True" 40 | 41 | # Initialize the HuggingFaceProcessor 42 | hfp = HuggingFaceProcessor( 43 | role=settings.AWS_ARN_ROLE, 44 | instance_count=1, 45 | instance_type="ml.g5.2xlarge", 46 | transformers_version="4.36", 47 | pytorch_version="2.1", 48 | py_version="py310", 49 | base_job_name="evaluate-llm-twin", 50 | env=env, 51 | ) 52 | 53 | # Run the processing job 54 | hfp.run( 55 | code="evaluate.py", 56 | source_dir=str(evaluation_dir), 57 | ) 58 | 59 | 60 | if __name__ == "__main__": 61 | run_evaluation_on_sagemaker() 62 | -------------------------------------------------------------------------------- /llm_engineering/model/finetuning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/model/finetuning/__init__.py -------------------------------------------------------------------------------- /llm_engineering/model/finetuning/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.33.0 2 | torch==2.4.0 3 | transformers==4.43.3 4 | datasets==2.20.0 5 | peft==0.12.0 6 | trl==0.9.6 7 | bitsandbytes==0.43.3 8 | comet-ml==3.44.3 9 | flash-attn==2.3.6 10 | unsloth==2024.9.post2 11 | -------------------------------------------------------------------------------- /llm_engineering/model/finetuning/sagemaker.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import HfApi 4 | from loguru import logger 5 | 6 | try: 7 | from sagemaker.huggingface import HuggingFace 8 | except ModuleNotFoundError: 9 | logger.warning("Couldn't load SageMaker imports. Run 'poetry install --with aws' to support AWS.") 10 | 11 | from llm_engineering.settings import settings 12 | 13 | finetuning_dir = Path(__file__).resolve().parent 14 | finetuning_requirements_path = finetuning_dir / "requirements.txt" 15 | 16 | 17 | def run_finetuning_on_sagemaker( 18 | finetuning_type: str = "sft", 19 | num_train_epochs: int = 3, 20 | per_device_train_batch_size: int = 2, 21 | learning_rate: float = 3e-4, 22 | dataset_huggingface_workspace: str = "mlabonne", 23 | is_dummy: bool = False, 24 | ) -> None: 25 | assert settings.HUGGINGFACE_ACCESS_TOKEN, "Hugging Face access token is required." 26 | assert settings.AWS_ARN_ROLE, "AWS ARN role is required." 
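# Note: the HuggingFace estimator defined below serializes the hyperparameters
# dict and hands it to finetune.py as command-line arguments, which is why only
# simple scalar values are used here.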
27 | 28 | if not finetuning_dir.exists(): 29 | raise FileNotFoundError(f"The directory {finetuning_dir} does not exist.") 30 | if not finetuning_requirements_path.exists(): 31 | raise FileNotFoundError(f"The file {finetuning_requirements_path} does not exist.") 32 | 33 | api = HfApi() 34 | user_info = api.whoami(token=settings.HUGGINGFACE_ACCESS_TOKEN) 35 | huggingface_user = user_info["name"] 36 | logger.info(f"Current Hugging Face user: {huggingface_user}") 37 | 38 | hyperparameters = { 39 | "finetuning_type": finetuning_type, 40 | "num_train_epochs": num_train_epochs, 41 | "per_device_train_batch_size": per_device_train_batch_size, 42 | "learning_rate": learning_rate, 43 | "dataset_huggingface_workspace": dataset_huggingface_workspace, 44 | "model_output_huggingface_workspace": huggingface_user, 45 | } 46 | if is_dummy: 47 | hyperparameters["is_dummy"] = True 48 | 49 | # Create the HuggingFace SageMaker estimator 50 | huggingface_estimator = HuggingFace( 51 | entry_point="finetune.py", 52 | source_dir=str(finetuning_dir), 53 | instance_type="ml.g5.2xlarge", 54 | instance_count=1, 55 | role=settings.AWS_ARN_ROLE, 56 | transformers_version="4.36", 57 | pytorch_version="2.1", 58 | py_version="py310", 59 | hyperparameters=hyperparameters, 60 | requirements_file=finetuning_requirements_path, 61 | environment={ 62 | "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN, 63 | "COMET_API_KEY": settings.COMET_API_KEY, 64 | "COMET_PROJECT_NAME": settings.COMET_PROJECT, 65 | }, 66 | ) 67 | 68 | # Start the training job on SageMaker. 69 | huggingface_estimator.fit() 70 | 71 | 72 | if __name__ == "__main__": 73 | run_finetuning_on_sagemaker() 74 | -------------------------------------------------------------------------------- /llm_engineering/model/inference/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import LLMInferenceSagemakerEndpoint 2 | from .run import InferenceExecutor 3 | 4 | __all__ = ["LLMInferenceSagemakerEndpoint", "InferenceExecutor"] 5 | -------------------------------------------------------------------------------- /llm_engineering/model/inference/inference.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict, Optional 3 | 4 | from loguru import logger 5 | 6 | try: 7 | import boto3 8 | except ModuleNotFoundError: 9 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 10 | 11 | 12 | from llm_engineering.domain.inference import Inference 13 | from llm_engineering.settings import settings 14 | 15 | 16 | class LLMInferenceSagemakerEndpoint(Inference): 17 | """ 18 | Class for performing inference using a SageMaker endpoint for LLM schemas. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | endpoint_name: str, 24 | default_payload: Optional[Dict[str, Any]] = None, 25 | inference_component_name: Optional[str] = None, 26 | ) -> None: 27 | super().__init__() 28 | 29 | self.client = boto3.client( 30 | "sagemaker-runtime", 31 | region_name=settings.AWS_REGION, 32 | aws_access_key_id=settings.AWS_ACCESS_KEY, 33 | aws_secret_access_key=settings.AWS_SECRET_KEY, 34 | ) 35 | self.endpoint_name = endpoint_name 36 | self.payload = default_payload if default_payload else self._default_payload() 37 | self.inference_component_name = inference_component_name 38 | 39 | def _default_payload(self) -> Dict[str, Any]: 40 | """ 41 | Generates the default payload for the inference request. 
42 | 43 | Returns: 44 | dict: The default payload. 45 | """ 46 | 47 | return { 48 | "inputs": "How is the weather?", 49 | "parameters": { 50 | "max_new_tokens": settings.MAX_NEW_TOKENS_INFERENCE, 51 | "top_p": settings.TOP_P_INFERENCE, 52 | "temperature": settings.TEMPERATURE_INFERENCE, 53 | "return_full_text": False, 54 | }, 55 | } 56 | 57 | def set_payload(self, inputs: str, parameters: Optional[Dict[str, Any]] = None) -> None: 58 | """ 59 | Sets the payload for the inference request. 60 | 61 | Args: 62 | inputs (str): The input text for the inference. 63 | parameters (dict, optional): Additional parameters for the inference. Defaults to None. 64 | """ 65 | 66 | self.payload["inputs"] = inputs 67 | if parameters: 68 | self.payload["parameters"].update(parameters) 69 | 70 | def inference(self) -> Dict[str, Any]: 71 | """ 72 | Performs the inference request using the SageMaker endpoint. 73 | 74 | Returns: 75 | dict: The response from the inference request. 76 | Raises: 77 | Exception: If an error occurs during the inference request. 78 | """ 79 | 80 | try: 81 | logger.info("Inference request sent.") 82 | invoke_args = { 83 | "EndpointName": self.endpoint_name, 84 | "ContentType": "application/json", 85 | "Body": json.dumps(self.payload), 86 | } 87 | if self.inference_component_name not in ["None", None]: 88 | invoke_args["InferenceComponentName"] = self.inference_component_name 89 | response = self.client.invoke_endpoint(**invoke_args) 90 | response_body = response["Body"].read().decode("utf8") 91 | 92 | return json.loads(response_body) 93 | 94 | except Exception: 95 | logger.exception("SageMaker inference failed.") 96 | 97 | raise 98 | -------------------------------------------------------------------------------- /llm_engineering/model/inference/run.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from llm_engineering.domain.inference import Inference 4 | from llm_engineering.settings import settings 5 | 6 | 7 | class InferenceExecutor: 8 | def __init__( 9 | self, 10 | llm: Inference, 11 | query: str, 12 | context: str | None = None, 13 | prompt: str | None = None, 14 | ) -> None: 15 | self.llm = llm 16 | self.query = query 17 | self.context = context if context else "" 18 | 19 | if prompt is None: 20 | self.prompt = """ 21 | You are a content creator. Write what the user asked you to while using the provided context as the primary source of information for the content. 22 | User query: {query} 23 | Context: {context} 24 | """ 25 | else: 26 | self.prompt = prompt 27 | 28 | def execute(self) -> str: 29 | self.llm.set_payload( 30 | inputs=self.prompt.format(query=self.query, context=self.context), 31 | parameters={ 32 | "max_new_tokens": settings.MAX_NEW_TOKENS_INFERENCE, 33 | "repetition_penalty": 1.1, 34 | "temperature": settings.TEMPERATURE_INFERENCE, 35 | }, 36 | ) 37 | answer = self.llm.inference()[0]["generated_text"] 38 | 39 | return answer 40 | -------------------------------------------------------------------------------- /llm_engineering/model/inference/test.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | from llm_engineering.model.inference.inference import LLMInferenceSagemakerEndpoint 4 | from llm_engineering.model.inference.run import InferenceExecutor 5 | from llm_engineering.settings import settings 6 | 7 | if __name__ == "__main__": 8 | text = "Write me a post about AWS SageMaker inference endpoints." 
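# This smoke test assumes a live endpoint; deploy one first with the
# deploy-inference-endpoint poe task defined in pyproject.toml.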
9 | logger.info(f"Running inference for text: '{text}'") 10 | llm = LLMInferenceSagemakerEndpoint( 11 | endpoint_name=settings.SAGEMAKER_ENDPOINT_INFERENCE, inference_component_name=None 12 | ) 13 | answer = InferenceExecutor(llm, text).execute() 14 | 15 | logger.info(f"Answer: '{answer}'") 16 | -------------------------------------------------------------------------------- /llm_engineering/model/utils.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | try: 4 | import boto3 5 | from botocore.exceptions import ClientError 6 | except ModuleNotFoundError: 7 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 8 | 9 | from llm_engineering.settings import settings 10 | 11 | 12 | class ResourceManager: 13 | def __init__(self) -> None: 14 | self.sagemaker_client = boto3.client( 15 | "sagemaker", 16 | region_name=settings.AWS_REGION, 17 | aws_access_key_id=settings.AWS_ACCESS_KEY, 18 | aws_secret_access_key=settings.AWS_SECRET_KEY, 19 | ) 20 | 21 | def endpoint_config_exists(self, endpoint_config_name: str) -> bool: 22 | """Check if the SageMaker endpoint configuration exists.""" 23 | try: 24 | self.sagemaker_client.describe_endpoint_config(EndpointConfigName=endpoint_config_name) 25 | logger.info(f"Endpoint configuration '{endpoint_config_name}' exists.") 26 | return True 27 | except ClientError: 28 | logger.info(f"Endpoint configuration '{endpoint_config_name}' does not exist.") 29 | return False 30 | 31 | def endpoint_exists(self, endpoint_name: str) -> bool: 32 | """Check if the SageMaker endpoint exists.""" 33 | try: 34 | self.sagemaker_client.describe_endpoint(EndpointName=endpoint_name) 35 | logger.info(f"Endpoint '{endpoint_name}' exists.") 36 | return True 37 | except self.sagemaker_client.exceptions.ResourceNotFoundException: 38 | logger.info(f"Endpoint '{endpoint_name}' does not exist.") 39 | return False 40 | -------------------------------------------------------------------------------- /llm_engineering/settings.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from pydantic_settings import BaseSettings, SettingsConfigDict 3 | from zenml.client import Client 4 | from zenml.exceptions import EntityExistsError 5 | 6 | 7 | class Settings(BaseSettings): 8 | model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") 9 | 10 | # --- Required settings even when working locally. --- 11 | 12 | # OpenAI API 13 | OPENAI_MODEL_ID: str = "gpt-4o-mini" 14 | OPENAI_API_KEY: str | None = None 15 | 16 | # Huggingface API 17 | HUGGINGFACE_ACCESS_TOKEN: str | None = None 18 | 19 | # Comet ML (during training) 20 | COMET_API_KEY: str | None = None 21 | COMET_PROJECT: str = "twin" 22 | 23 | # --- Required settings when deploying the code. --- 24 | # --- Otherwise, default values values work fine. 
--- 25 | 26 | # MongoDB database 27 | DATABASE_HOST: str = "mongodb://llm_engineering:llm_engineering@127.0.0.1:27017" 28 | DATABASE_NAME: str = "twin" 29 | 30 | # Qdrant vector database 31 | USE_QDRANT_CLOUD: bool = False 32 | QDRANT_DATABASE_HOST: str = "localhost" 33 | QDRANT_DATABASE_PORT: int = 6333 34 | QDRANT_CLOUD_URL: str = "str" 35 | QDRANT_APIKEY: str | None = None 36 | 37 | # AWS Authentication 38 | AWS_REGION: str = "eu-central-1" 39 | AWS_ACCESS_KEY: str | None = None 40 | AWS_SECRET_KEY: str | None = None 41 | AWS_ARN_ROLE: str | None = None 42 | 43 | # --- Optional settings used to tweak the code. --- 44 | 45 | # AWS SageMaker 46 | HF_MODEL_ID: str = "mlabonne/TwinLlama-3.1-8B-DPO" 47 | GPU_INSTANCE_TYPE: str = "ml.g5.2xlarge" 48 | SM_NUM_GPUS: int = 1 49 | MAX_INPUT_LENGTH: int = 2048 50 | MAX_TOTAL_TOKENS: int = 4096 51 | MAX_BATCH_TOTAL_TOKENS: int = 4096 52 | COPIES: int = 1 # Number of replicas 53 | GPUS: int = 1 # Number of GPUs 54 | CPUS: int = 2 # Number of CPU cores 55 | 56 | SAGEMAKER_ENDPOINT_CONFIG_INFERENCE: str = "twin" 57 | SAGEMAKER_ENDPOINT_INFERENCE: str = "twin" 58 | TEMPERATURE_INFERENCE: float = 0.01 59 | TOP_P_INFERENCE: float = 0.9 60 | MAX_NEW_TOKENS_INFERENCE: int = 150 61 | 62 | # RAG 63 | TEXT_EMBEDDING_MODEL_ID: str = "sentence-transformers/all-MiniLM-L6-v2" 64 | RERANKING_CROSS_ENCODER_MODEL_ID: str = "cross-encoder/ms-marco-MiniLM-L-4-v2" 65 | RAG_MODEL_DEVICE: str = "cpu" 66 | 67 | # LinkedIn Credentials 68 | LINKEDIN_USERNAME: str | None = None 69 | LINKEDIN_PASSWORD: str | None = None 70 | 71 | @property 72 | def OPENAI_MAX_TOKEN_WINDOW(self) -> int: 73 | official_max_token_window = { 74 | "gpt-3.5-turbo": 16385, 75 | "gpt-4-turbo": 128000, 76 | "gpt-4o": 128000, 77 | "gpt-4o-mini": 128000, 78 | }.get(self.OPENAI_MODEL_ID, 128000) 79 | 80 | max_token_window = int(official_max_token_window * 0.90) 81 | 82 | return max_token_window 83 | 84 | @classmethod 85 | def load_settings(cls) -> "Settings": 86 | """ 87 | Tries to load the settings from the ZenML secret store. If the secret does not exist, it initializes the settings from the .env file and default values. 88 | 89 | Returns: 90 | Settings: The initialized settings object. 91 | """ 92 | 93 | try: 94 | logger.info("Loading settings from the ZenML secret store.") 95 | 96 | settings_secrets = Client().get_secret("settings") 97 | settings = Settings(**settings_secrets.secret_values) 98 | except (RuntimeError, KeyError): 99 | logger.warning( 100 | "Failed to load settings from the ZenML secret store. Defaulting to loading the settings from the '.env' file." 101 | ) 102 | settings = Settings() 103 | 104 | return settings 105 | 106 | def export(self) -> None: 107 | """ 108 | Exports the settings to the ZenML secret store. 109 | """ 110 | 111 | env_vars = self.model_dump() 112 | for key, value in env_vars.items(): 113 | env_vars[key] = str(value) 114 | 115 | client = Client() 116 | 117 | try: 118 | client.create_secret(name="settings", values=env_vars) 119 | except EntityExistsError: 120 | logger.warning( 121 | "Secret 'settings' already exists. Delete it manually by running 'zenml secret delete settings', before trying to recreate it."
122 | ) 123 | 124 | 125 | settings = Settings.load_settings() 126 | -------------------------------------------------------------------------------- /pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .digital_data_etl import digital_data_etl 2 | from .end_to_end_data import end_to_end_data 3 | from .evaluating import evaluating 4 | from .export_artifact_to_json import export_artifact_to_json 5 | from .feature_engineering import feature_engineering 6 | from .generate_datasets import generate_datasets 7 | from .training import training 8 | 9 | __all__ = [ 10 | "generate_datasets", 11 | "end_to_end_data", 12 | "evaluating", 13 | "export_artifact_to_json", 14 | "digital_data_etl", 15 | "feature_engineering", 16 | "training", 17 | ] 18 | -------------------------------------------------------------------------------- /pipelines/digital_data_etl.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from steps.etl import crawl_links, get_or_create_user 4 | 5 | 6 | @pipeline 7 | def digital_data_etl(user_full_name: str, links: list[str]) -> str: 8 | user = get_or_create_user(user_full_name) 9 | last_step = crawl_links(user=user, links=links) 10 | 11 | return last_step.invocation_id 12 | -------------------------------------------------------------------------------- /pipelines/end_to_end_data.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from .digital_data_etl import digital_data_etl 4 | from .feature_engineering import feature_engineering 5 | from .generate_datasets import generate_datasets 6 | 7 | 8 | @pipeline 9 | def end_to_end_data( 10 | author_links: list[dict[str, str | list[str]]], 11 | test_split_size: float = 0.1, 12 | push_to_huggingface: bool = False, 13 | dataset_id: str | None = None, 14 | mock: bool = False, 15 | ) -> None: 16 | wait_for_ids = [] 17 | for author_data in author_links: 18 | last_step_invocation_id = digital_data_etl( 19 | user_full_name=author_data["user_full_name"], links=author_data["links"] 20 | ) 21 | 22 | wait_for_ids.append(last_step_invocation_id) 23 | 24 | author_full_names = [author_data["user_full_name"] for author_data in author_links] 25 | wait_for_ids = feature_engineering(author_full_names=author_full_names, wait_for=wait_for_ids) 26 | 27 | generate_datasets( 28 | test_split_size=test_split_size, 29 | push_to_huggingface=push_to_huggingface, 30 | dataset_id=dataset_id, 31 | mock=mock, 32 | wait_for=wait_for_ids, 33 | ) 34 | -------------------------------------------------------------------------------- /pipelines/evaluating.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from steps import evaluating as evaluating_steps 4 | 5 | 6 | @pipeline 7 | def evaluating( 8 | is_dummy: bool = False, 9 | ) -> None: 10 | evaluating_steps.evaluate( 11 | is_dummy=is_dummy, 12 | ) 13 | -------------------------------------------------------------------------------- /pipelines/export_artifact_to_json.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from zenml import pipeline 4 | from zenml.client import Client 5 | 6 | from steps import export as export_steps 7 | 8 | 9 | @pipeline 10 | def export_artifact_to_json(artifact_names: list[str], output_dir: Path = Path("output")) -> None: 11 | for artifact_name in 
artifact_names: 12 | artifact = Client().get_artifact_version(name_id_or_prefix=artifact_name) 13 | 14 | data = export_steps.serialize_artifact(artifact=artifact, artifact_name=artifact_name) 15 | 16 | export_steps.to_json(data=data, to_file=output_dir / f"{artifact_name}.json") 17 | -------------------------------------------------------------------------------- /pipelines/feature_engineering.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from steps import feature_engineering as fe_steps 4 | 5 | 6 | @pipeline 7 | def feature_engineering(author_full_names: list[str], wait_for: str | list[str] | None = None) -> list[str]: 8 | raw_documents = fe_steps.query_data_warehouse(author_full_names, after=wait_for) 9 | 10 | cleaned_documents = fe_steps.clean_documents(raw_documents) 11 | last_step_1 = fe_steps.load_to_vector_db(cleaned_documents) 12 | 13 | embedded_documents = fe_steps.chunk_and_embed(cleaned_documents) 14 | last_step_2 = fe_steps.load_to_vector_db(embedded_documents) 15 | 16 | return [last_step_1.invocation_id, last_step_2.invocation_id] 17 | -------------------------------------------------------------------------------- /pipelines/generate_datasets.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from llm_engineering.domain.dataset import DatasetType 4 | from steps import generate_datasets as cd_steps 5 | 6 | 7 | @pipeline 8 | def generate_datasets( 9 | dataset_type: DatasetType = DatasetType.INSTRUCTION, 10 | test_split_size: float = 0.1, 11 | push_to_huggingface: bool = False, 12 | dataset_id: str | None = None, 13 | mock: bool = False, 14 | wait_for: str | list[str] | None = None, 15 | ) -> None: 16 | cleaned_documents = cd_steps.query_feature_store(after=wait_for) 17 | prompts = cd_steps.create_prompts(documents=cleaned_documents, dataset_type=dataset_type) 18 | if dataset_type == DatasetType.INSTRUCTION: 19 | dataset = cd_steps.generate_intruction_dataset(prompts=prompts, test_split_size=test_split_size, mock=mock) 20 | elif dataset_type == DatasetType.PREFERENCE: 21 | dataset = cd_steps.generate_preference_dataset(prompts=prompts, test_split_size=test_split_size, mock=mock) 22 | else: 23 | raise ValueError(f"Invalid dataset type: {dataset_type}") 24 | 25 | if push_to_huggingface: 26 | cd_steps.push_to_huggingface(dataset=dataset, dataset_id=dataset_id) 27 | -------------------------------------------------------------------------------- /pipelines/training.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from steps import training as training_steps 4 | 5 | 6 | @pipeline 7 | def training( 8 | finetuning_type: str = "sft", 9 | num_train_epochs: int = 3, 10 | per_device_train_batch_size: int = 2, 11 | learning_rate: float = 3e-4, 12 | dataset_huggingface_workspace: str = "mlabonne", 13 | is_dummy: bool = False, 14 | ) -> None: 15 | training_steps.train( 16 | finetuning_type=finetuning_type, 17 | num_train_epochs=num_train_epochs, 18 | per_device_train_batch_size=per_device_train_batch_size, 19 | learning_rate=learning_rate, 20 | dataset_huggingface_workspace=dataset_huggingface_workspace, 21 | is_dummy=is_dummy, 22 | ) 23 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "llm-engineering" 3 | 
version = "0.1.0" 4 | description = "" 5 | authors = ["iusztinpaul "] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "~3.11" 11 | zenml = { version = "0.74.0", extras = ["server"] } 12 | pymongo = "^4.6.2" 13 | click = "^8.0.1" 14 | loguru = "^0.7.2" 15 | rich = "^13.7.1" 16 | numpy = "^1.26.4" 17 | poethepoet = "0.29.0" 18 | datasets = "^3.0.1" 19 | torch = "2.2.2" 20 | 21 | # Digital data ETL 22 | selenium = "^4.21.0" 23 | webdriver-manager = "^4.0.1" 24 | beautifulsoup4 = "^4.12.3" 25 | html2text = "^2024.2.26" 26 | jmespath = "^1.0.1" 27 | chromedriver-autoinstaller = "^0.6.4" 28 | 29 | # Feature engineering 30 | qdrant-client = "^1.8.0" 31 | langchain = "^0.2.11" 32 | sentence-transformers = "^3.0.0" 33 | 34 | # RAG 35 | langchain-openai = "^0.1.3" 36 | jinja2 = "^3.1.4" 37 | tiktoken = "^0.7.0" 38 | fake-useragent = "^1.5.1" 39 | langchain-community = "^0.2.11" 40 | 41 | # Inference 42 | fastapi = ">=0.100,<=0.110" 43 | uvicorn = "^0.30.6" 44 | opik = "^0.2.2" 45 | 46 | 47 | [tool.poetry.group.dev.dependencies] 48 | ruff = "^0.4.9" 49 | pre-commit = "^3.7.1" 50 | pytest = "^8.2.2" 51 | 52 | 53 | [tool.poetry.group.aws.dependencies] 54 | sagemaker = ">=2.232.2" 55 | s3fs = ">2022.3.0" 56 | aws-profile-manager = "^0.7.3" 57 | kubernetes = "^30.1.0" 58 | sagemaker-huggingface-inference-toolkit = "^2.4.0" 59 | 60 | 61 | [build-system] 62 | requires = ["poetry-core"] 63 | build-backend = "poetry.core.masonry.api" 64 | 65 | # ---------------------------------- 66 | # --- Poe the Poet Configuration --- 67 | # ---------------------------------- 68 | 69 | [tool.poe.tasks] 70 | # Data pipelines 71 | run-digital-data-etl-alex = "echo 'It is not supported anymore.'" 72 | run-digital-data-etl-maxime = "poetry run python -m tools.run --run-etl --no-cache --etl-config-filename digital_data_etl_maxime_labonne.yaml" 73 | run-digital-data-etl-paul = "poetry run python -m tools.run --run-etl --no-cache --etl-config-filename digital_data_etl_paul_iusztin.yaml" 74 | run-digital-data-etl = [ 75 | "run-digital-data-etl-maxime", 76 | "run-digital-data-etl-paul", 77 | ] 78 | run-feature-engineering-pipeline = "poetry run python -m tools.run --no-cache --run-feature-engineering" 79 | run-generate-instruct-datasets-pipeline = "poetry run python -m tools.run --no-cache --run-generate-instruct-datasets" 80 | run-generate-preference-datasets-pipeline = "poetry run python -m tools.run --no-cache --run-generate-preference-datasets" 81 | run-end-to-end-data-pipeline = "poetry run python -m tools.run --no-cache --run-end-to-end-data" 82 | 83 | # Utility pipelines 84 | run-export-artifact-to-json-pipeline = "poetry run python -m tools.run --no-cache --run-export-artifact-to-json" 85 | run-export-data-warehouse-to-json = "poetry run python -m tools.data_warehouse --export-raw-data" 86 | run-import-data-warehouse-from-json = "poetry run python -m tools.data_warehouse --import-raw-data" 87 | 88 | # Training pipelines 89 | run-training-pipeline = "poetry run python -m tools.run --no-cache --run-training" 90 | run-evaluation-pipeline = "poetry run python -m tools.run --no-cache --run-evaluation" 91 | 92 | # Inference 93 | call-rag-retrieval-module = "poetry run python -m tools.rag" 94 | 95 | run-inference-ml-service = "poetry run uvicorn tools.ml_service:app --host 0.0.0.0 --port 8000 --reload" 96 | call-inference-ml-service = "curl -X POST 'http://127.0.0.1:8000/rag' -H 'Content-Type: application/json' -d '{\"query\": \"My name is Paul Iusztin. 
Could you draft a LinkedIn post discussing RAG systems? I am particularly interested in how RAG works and how it is integrated with vector DBs and LLMs.\"}'" 97 | 98 | # Infrastructure 99 | ## Local infrastructure 100 | local-docker-infrastructure-up = "docker compose up -d" 101 | local-docker-infrastructure-down = "docker compose stop" 102 | local-zenml-server-down = "poetry run zenml logout --local" 103 | local-infrastructure-up = [ 104 | "local-docker-infrastructure-up", 105 | "local-zenml-server-down", 106 | "local-zenml-server-up", 107 | ] 108 | local-infrastructure-down = [ 109 | "local-docker-infrastructure-down", 110 | "local-zenml-server-down", 111 | ] 112 | set-local-stack = "poetry run zenml stack set default" 113 | set-aws-stack = "poetry run zenml stack set aws-stack" 114 | set-asynchronous-runs = "poetry run zenml orchestrator update aws-stack --synchronous=False" 115 | zenml-server-disconnect = "poetry run zenml disconnect" 116 | 117 | ## Settings 118 | export-settings-to-zenml = "poetry run python -m tools.run --export-settings" 119 | delete-settings-zenml = "poetry run zenml secret delete settings" 120 | 121 | ## SageMaker 122 | create-sagemaker-role = "poetry run python -m llm_engineering.infrastructure.aws.roles.create_sagemaker_role" 123 | create-sagemaker-execution-role = "poetry run python -m llm_engineering.infrastructure.aws.roles.create_execution_role" 124 | deploy-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.huggingface.run" 125 | test-sagemaker-endpoint = "poetry run python -m llm_engineering.model.inference.test" 126 | delete-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.delete_sagemaker_endpoint" 127 | 128 | ## Docker 129 | build-docker-image = "docker buildx build --platform linux/amd64 -t llmtwin -f Dockerfile ." 130 | run-docker-end-to-end-data-pipeline = "docker run --rm --network host --shm-size=2g --env-file .env llmtwin poetry poe run-end-to-end-data-pipeline" 131 | bash-docker-container = "docker run --rm -it --network host --env-file .env llmtwin bash" 132 | 133 | # QA 134 | lint-check = "poetry run ruff check ." 135 | format-check = "poetry run ruff format --check ." 136 | lint-check-docker = "sh -c 'docker run --rm -i hadolint/hadolint < Dockerfile'" 137 | gitleaks-check = "docker run -v .:/src zricethezav/gitleaks:latest dir /src/llm_engineering" 138 | lint-fix = "poetry run ruff check --fix ." 139 | format-fix = "poetry run ruff format ."
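# Note on the next section: `local-zenml-server-up` is a Poe the Poet "switch"
# task. `control.expr = "sys.platform"` is evaluated first, then the matching
# [[tool.poe.tasks.local-zenml-server-up.switch]] case runs: macOS exports
# OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES to avoid fork-safety crashes when the
# local ZenML server starts, Windows starts the server in blocking mode, and
# every other platform falls through to the default case.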
140 | 141 | [tool.poe.tasks.local-zenml-server-up] 142 | control.expr = "sys.platform" 143 | 144 | [[tool.poe.tasks.local-zenml-server-up.switch]] 145 | case = "darwin" 146 | env = { OBJC_DISABLE_INITIALIZE_FORK_SAFETY = "YES" } 147 | cmd = "poetry run zenml login --local" 148 | 149 | [[tool.poe.tasks.local-zenml-server-up.switch]] 150 | case = "win32" 151 | cmd = "poetry run zenml login --local --blocking" 152 | 153 | [[tool.poe.tasks.local-zenml-server-up.switch]] 154 | cmd = "poetry run zenml login --local" 155 | 156 | # Tests 157 | [tool.poe.tasks.test] 158 | cmd = "poetry run pytest tests/" 159 | env = { ENV_FILE = ".env.testing" } 160 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | line-length = 120 2 | target-version = "py311" 3 | extend-exclude = [ 4 | ".github", 5 | "graphql_client", 6 | "graphql_schemas" 7 | ] 8 | 9 | [lint] 10 | extend-select = [ 11 | "I", 12 | "B", 13 | "G", 14 | "T20", 15 | "PTH", 16 | "RUF" 17 | ] 18 | 19 | [lint.isort] 20 | case-sensitive = true 21 | 22 | [lint.pydocstyle] 23 | convention = "google" -------------------------------------------------------------------------------- /steps/__init__.py: -------------------------------------------------------------------------------- 1 | from . import etl, evaluating, export, feature_engineering, generate_datasets, training 2 | 3 | __all__ = ["generate_datasets", "export", "etl", "feature_engineering", "training", "evaluating"] 4 | -------------------------------------------------------------------------------- /steps/etl/__init__.py: -------------------------------------------------------------------------------- 1 | from .crawl_links import crawl_links 2 | from .get_or_create_user import get_or_create_user 3 | 4 | __all__ = ["crawl_links", "get_or_create_user"] 5 | -------------------------------------------------------------------------------- /steps/etl/crawl_links.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from loguru import logger 4 | from tqdm import tqdm 5 | from typing_extensions import Annotated 6 | from zenml import get_step_context, step 7 | 8 | from llm_engineering.application.crawlers.dispatcher import CrawlerDispatcher 9 | from llm_engineering.domain.documents import UserDocument 10 | 11 | 12 | @step 13 | def crawl_links(user: UserDocument, links: list[str]) -> Annotated[list[str], "crawled_links"]: 14 | dispatcher = CrawlerDispatcher.build().register_linkedin().register_medium().register_github() 15 | 16 | logger.info(f"Starting to crawl {len(links)} link(s).") 17 | 18 | metadata = {} 19 | successful_crawls = 0 20 | for link in tqdm(links): 21 | successful_crawl, crawled_domain = _crawl_link(dispatcher, link, user) 22 | successful_crawls += successful_crawl 23 | 24 | metadata = _add_to_metadata(metadata, crawled_domain, successful_crawl) 25 | 26 | step_context = get_step_context() 27 | step_context.add_output_metadata(output_name="crawled_links", metadata=metadata) 28 | 29 | logger.info(f"Successfully crawled {successful_crawls} / {len(links)} links.") 30 | 31 | return links 32 | 33 | 34 | def _crawl_link(dispatcher: CrawlerDispatcher, link: str, user: UserDocument) -> tuple[bool, str]: 35 | crawler = dispatcher.get_crawler(link) 36 | crawler_domain = urlparse(link).netloc 37 | 38 | try: 39 | crawler.extract(link=link, user=user) 40 | 41 | return (True, crawler_domain) 42 |
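    # Any crawler failure raised below is logged and counted as an unsuccessful
    # crawl for its domain (see _add_to_metadata) instead of failing the whole step.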
except Exception as e: 43 | logger.error(f"An error occurred while crawling: {e!s}") 44 | 45 | return (False, crawler_domain) 46 | 47 | 48 | def _add_to_metadata(metadata: dict, domain: str, successful_crawl: bool) -> dict: 49 | if domain not in metadata: 50 | metadata[domain] = {} 51 | metadata[domain]["successful"] = metadata[domain].get("successful", 0) + successful_crawl 52 | metadata[domain]["total"] = metadata[domain].get("total", 0) + 1 53 | 54 | return metadata 55 | -------------------------------------------------------------------------------- /steps/etl/get_or_create_user.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from typing_extensions import Annotated 3 | from zenml import get_step_context, step 4 | 5 | from llm_engineering.application import utils 6 | from llm_engineering.domain.documents import UserDocument 7 | 8 | 9 | @step 10 | def get_or_create_user(user_full_name: str) -> Annotated[UserDocument, "user"]: 11 | logger.info(f"Getting or creating user: {user_full_name}") 12 | 13 | first_name, last_name = utils.split_user_full_name(user_full_name) 14 | 15 | user = UserDocument.get_or_create(first_name=first_name, last_name=last_name) 16 | 17 | step_context = get_step_context() 18 | step_context.add_output_metadata(output_name="user", metadata=_get_metadata(user_full_name, user)) 19 | 20 | return user 21 | 22 | 23 | def _get_metadata(user_full_name: str, user: UserDocument) -> dict: 24 | return { 25 | "query": { 26 | "user_full_name": user_full_name, 27 | }, 28 | "retrieved": { 29 | "user_id": str(user.id), 30 | "first_name": user.first_name, 31 | "last_name": user.last_name, 32 | }, 33 | } 34 | -------------------------------------------------------------------------------- /steps/evaluating/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluate import evaluate 2 | 3 | __all__ = ["evaluate"] 4 | -------------------------------------------------------------------------------- /steps/evaluating/evaluate.py: -------------------------------------------------------------------------------- 1 | from zenml import step 2 | 3 | from llm_engineering.model.evaluation.sagemaker import run_evaluation_on_sagemaker 4 | 5 | 6 | @step 7 | def evaluate( 8 | is_dummy: bool = False, 9 | ) -> None: 10 | run_evaluation_on_sagemaker( 11 | is_dummy=is_dummy, 12 | ) 13 | -------------------------------------------------------------------------------- /steps/export/__init__.py: -------------------------------------------------------------------------------- 1 | from .serialize_artifact import serialize_artifact 2 | from .to_json import to_json 3 | 4 | __all__ = ["to_json", "serialize_artifact"] 5 | -------------------------------------------------------------------------------- /steps/export/serialize_artifact.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from pydantic import BaseModel 4 | from typing_extensions import Annotated 5 | from zenml import get_step_context, step 6 | 7 | 8 | @step 9 | def serialize_artifact(artifact: Any, artifact_name: str) -> Annotated[dict, "serialized_artifact"]: 10 | serialized_artifact = _serialize_artifact(artifact) 11 | 12 | if serialized_artifact is None: 13 | raise ValueError("Artifact is None") 14 | elif not isinstance(serialized_artifact, dict): 15 | serialized_artifact = {"artifact_data": serialized_artifact} 16 | 17 | step_context = get_step_context() 18 |
step_context.add_output_metadata(output_name="serialized_artifact", metadata={"artifact_name": artifact_name}) 19 | 20 | return serialized_artifact 21 | 22 | 23 | def _serialize_artifact(artifact: list | dict | BaseModel | str | int | float | bool | None): 24 | if isinstance(artifact, list): 25 | return [_serialize_artifact(item) for item in artifact] 26 | elif isinstance(artifact, dict): 27 | return {key: _serialize_artifact(value) for key, value in artifact.items()} 28 | elif isinstance(artifact, BaseModel): 29 | return artifact.model_dump() 30 | else: 31 | return artifact 32 | -------------------------------------------------------------------------------- /steps/export/to_json.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from typing_extensions import Annotated 4 | from zenml import step 5 | 6 | from llm_engineering.infrastructure.files_io import JsonFileManager 7 | 8 | 9 | @step 10 | def to_json( 11 | data: Annotated[dict, "serialized_artifact"], 12 | to_file: Annotated[Path, "to_file"], 13 | ) -> Annotated[Path, "exported_file_path"]: 14 | absolute_file_path = JsonFileManager.write( 15 | filename=to_file, 16 | data=data, 17 | ) 18 | 19 | return absolute_file_path 20 | -------------------------------------------------------------------------------- /steps/feature_engineering/__init__.py: -------------------------------------------------------------------------------- 1 | from .clean import clean_documents 2 | from .load_to_vector_db import load_to_vector_db 3 | from .query_data_warehouse import query_data_warehouse 4 | from .rag import chunk_and_embed 5 | 6 | __all__ = [ 7 | "clean_documents", 8 | "load_to_vector_db", 9 | "query_data_warehouse", 10 | "chunk_and_embed", 11 | ] 12 | -------------------------------------------------------------------------------- /steps/feature_engineering/clean.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Annotated 2 | from zenml import get_step_context, step 3 | 4 | from llm_engineering.application.preprocessing import CleaningDispatcher 5 | from llm_engineering.domain.cleaned_documents import CleanedDocument 6 | 7 | 8 | @step 9 | def clean_documents( 10 | documents: Annotated[list, "raw_documents"], 11 | ) -> Annotated[list, "cleaned_documents"]: 12 | cleaned_documents = [] 13 | for document in documents: 14 | cleaned_document = CleaningDispatcher.dispatch(document) 15 | cleaned_documents.append(cleaned_document) 16 | 17 | step_context = get_step_context() 18 | step_context.add_output_metadata(output_name="cleaned_documents", metadata=_get_metadata(cleaned_documents)) 19 | 20 | return cleaned_documents 21 | 22 | 23 | def _get_metadata(cleaned_documents: list[CleanedDocument]) -> dict: 24 | metadata = {"num_documents": len(cleaned_documents)} 25 | for document in cleaned_documents: 26 | category = document.get_category() 27 | if category not in metadata: 28 | metadata[category] = {} 29 | if "authors" not in metadata[category]: 30 | metadata[category]["authors"] = list() 31 | 32 | metadata[category]["num_documents"] = metadata[category].get("num_documents", 0) + 1 33 | metadata[category]["authors"].append(document.author_full_name) 34 | 35 | for value in metadata.values(): 36 | if isinstance(value, dict) and "authors" in value: 37 | value["authors"] = list(set(value["authors"])) 38 | 39 | return metadata 40 | --------------------------------------------------------------------------------
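Note: the CleaningDispatcher used by clean_documents above lives in llm_engineering/application/preprocessing (dispatchers.py, not inlined in this listing). A minimal sketch of the dispatch-by-document-type pattern such a dispatcher follows — the registry and handler signatures here are illustrative assumptions, not the repository's actual classes:

# Illustrative sketch only: route each raw document to the cleaning handler
# registered for its type; the real dispatcher lives in the preprocessing module.
from typing import Any, Callable

_CLEANING_HANDLERS: dict[type, Callable[[Any], Any]] = {}


def register_handler(document_type: type, handler: Callable[[Any], Any]) -> None:
    _CLEANING_HANDLERS[document_type] = handler


def dispatch(document: Any) -> Any:
    # Walk the MRO so subclasses fall back to a handler registered on a parent class.
    for klass in type(document).__mro__:
        handler = _CLEANING_HANDLERS.get(klass)
        if handler is not None:
            return handler(document)
    raise ValueError(f"No cleaning handler registered for {type(document).__name__}")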
/steps/feature_engineering/load_to_vector_db.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from typing_extensions import Annotated 3 | from zenml import step 4 | 5 | from llm_engineering.application import utils 6 | from llm_engineering.domain.base import VectorBaseDocument 7 | 8 | 9 | @step 10 | def load_to_vector_db( 11 | documents: Annotated[list, "documents"], 12 | ) -> Annotated[bool, "successful"]: 13 | logger.info(f"Loading {len(documents)} documents into the vector database.") 14 | 15 | grouped_documents = VectorBaseDocument.group_by_class(documents) 16 | for document_class, documents in grouped_documents.items(): 17 | logger.info(f"Loading documents into {document_class.get_collection_name()}") 18 | for documents_batch in utils.misc.batch(documents, size=4): 19 | try: 20 | document_class.bulk_insert(documents_batch) 21 | except Exception: 22 | logger.error(f"Failed to insert documents into {document_class.get_collection_name()}") 23 | 24 | return False 25 | 26 | return True 27 | -------------------------------------------------------------------------------- /steps/feature_engineering/query_data_warehouse.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | 3 | from loguru import logger 4 | from typing_extensions import Annotated 5 | from zenml import get_step_context, step 6 | 7 | from llm_engineering.application import utils 8 | from llm_engineering.domain.base.nosql import NoSQLBaseDocument 9 | from llm_engineering.domain.documents import ArticleDocument, Document, PostDocument, RepositoryDocument, UserDocument 10 | 11 | 12 | @step 13 | def query_data_warehouse( 14 | author_full_names: list[str], 15 | ) -> Annotated[list, "raw_documents"]: 16 | documents = [] 17 | authors = [] 18 | for author_full_name in author_full_names: 19 | logger.info(f"Querying data warehouse for user: {author_full_name}") 20 | 21 | first_name, last_name = utils.split_user_full_name(author_full_name) 22 | logger.info(f"First name: {first_name}, Last name: {last_name}") 23 | user = UserDocument.get_or_create(first_name=first_name, last_name=last_name) 24 | authors.append(user) 25 | 26 | results = fetch_all_data(user) 27 | user_documents = [doc for query_result in results.values() for doc in query_result] 28 | 29 | documents.extend(user_documents) 30 | 31 | step_context = get_step_context() 32 | step_context.add_output_metadata(output_name="raw_documents", metadata=_get_metadata(documents)) 33 | 34 | return documents 35 | 36 | 37 | def fetch_all_data(user: UserDocument) -> dict[str, list[NoSQLBaseDocument]]: 38 | user_id = str(user.id) 39 | with ThreadPoolExecutor() as executor: 40 | future_to_query = { 41 | executor.submit(__fetch_articles, user_id): "articles", 42 | executor.submit(__fetch_posts, user_id): "posts", 43 | executor.submit(__fetch_repositories, user_id): "repositories", 44 | } 45 | 46 | results = {} 47 | for future in as_completed(future_to_query): 48 | query_name = future_to_query[future] 49 | try: 50 | results[query_name] = future.result() 51 | except Exception: 52 | logger.exception(f"'{query_name}' request failed.") 53 | 54 | results[query_name] = [] 55 | 56 | return results 57 | 58 | 59 | def __fetch_articles(user_id) -> list[NoSQLBaseDocument]: 60 | return ArticleDocument.bulk_find(author_id=user_id) 61 | 62 | 63 | def __fetch_posts(user_id) -> list[NoSQLBaseDocument]: 64 | return 
PostDocument.bulk_find(author_id=user_id) 65 | 66 | 67 | def __fetch_repositories(user_id) -> list[NoSQLBaseDocument]: 68 | return RepositoryDocument.bulk_find(author_id=user_id) 69 | 70 | 71 | def _get_metadata(documents: list[Document]) -> dict: 72 | metadata = { 73 | "num_documents": len(documents), 74 | } 75 | for document in documents: 76 | collection = document.get_collection_name() 77 | if collection not in metadata: 78 | metadata[collection] = {} 79 | if "authors" not in metadata[collection]: 80 | metadata[collection]["authors"] = list() 81 | 82 | metadata[collection]["num_documents"] = metadata[collection].get("num_documents", 0) + 1 83 | metadata[collection]["authors"].append(document.author_full_name) 84 | 85 | for value in metadata.values(): 86 | if isinstance(value, dict) and "authors" in value: 87 | value["authors"] = list(set(value["authors"])) 88 | 89 | return metadata 90 | -------------------------------------------------------------------------------- /steps/feature_engineering/rag.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Annotated 2 | from zenml import get_step_context, step 3 | 4 | from llm_engineering.application import utils 5 | from llm_engineering.application.preprocessing import ChunkingDispatcher, EmbeddingDispatcher 6 | from llm_engineering.domain.chunks import Chunk 7 | from llm_engineering.domain.embedded_chunks import EmbeddedChunk 8 | 9 | 10 | @step 11 | def chunk_and_embed( 12 | cleaned_documents: Annotated[list, "cleaned_documents"], 13 | ) -> Annotated[list, "embedded_documents"]: 14 | metadata = {"chunking": {}, "embedding": {}, "num_documents": len(cleaned_documents)} 15 | 16 | embedded_chunks = [] 17 | for document in cleaned_documents: 18 | chunks = ChunkingDispatcher.dispatch(document) 19 | metadata["chunking"] = _add_chunks_metadata(chunks, metadata["chunking"]) 20 | 21 | for batched_chunks in utils.misc.batch(chunks, 10): 22 | batched_embedded_chunks = EmbeddingDispatcher.dispatch(batched_chunks) 23 | embedded_chunks.extend(batched_embedded_chunks) 24 | 25 | metadata["embedding"] = _add_embeddings_metadata(embedded_chunks, metadata["embedding"]) 26 | metadata["num_chunks"] = len(embedded_chunks) 27 | metadata["num_embedded_chunks"] = len(embedded_chunks) 28 | 29 | step_context = get_step_context() 30 | step_context.add_output_metadata(output_name="embedded_documents", metadata=metadata) 31 | 32 | return embedded_chunks 33 | 34 | 35 | def _add_chunks_metadata(chunks: list[Chunk], metadata: dict) -> dict: 36 | for chunk in chunks: 37 | category = chunk.get_category() 38 | if category not in metadata: 39 | metadata[category] = chunk.metadata 40 | if "authors" not in metadata[category]: 41 | metadata[category]["authors"] = list() 42 | 43 | metadata[category]["num_chunks"] = metadata[category].get("num_chunks", 0) + 1 44 | metadata[category]["authors"].append(chunk.author_full_name) 45 | 46 | for value in metadata.values(): 47 | if isinstance(value, dict) and "authors" in value: 48 | value["authors"] = list(set(value["authors"])) 49 | 50 | return metadata 51 | 52 | 53 | def _add_embeddings_metadata(embedded_chunks: list[EmbeddedChunk], metadata: dict) -> dict: 54 | for embedded_chunk in embedded_chunks: 55 | category = embedded_chunk.get_category() 56 | if category not in metadata: 57 | metadata[category] = embedded_chunk.metadata 58 | if "authors" not in metadata[category]: 59 | metadata[category]["authors"] = list() 60 | 61 | 
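        # Unlike _add_chunks_metadata above, only authors are accumulated per
        # category here; totals are attached once per step via the num_chunks and
        # num_embedded_chunks fields set in chunk_and_embed.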
metadata[category]["authors"].append(embedded_chunk.author_full_name) 62 | 63 | for value in metadata.values(): 64 | if isinstance(value, dict) and "authors" in value: 65 | value["authors"] = list(set(value["authors"])) 66 | 67 | return metadata 68 | -------------------------------------------------------------------------------- /steps/generate_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .create_prompts import create_prompts 2 | from .generate_intruction_dataset import generate_intruction_dataset 3 | from .generate_preference_dataset import generate_preference_dataset 4 | from .push_to_huggingface import push_to_huggingface 5 | from .query_feature_store import query_feature_store 6 | 7 | __all__ = [ 8 | "generate_intruction_dataset", 9 | "generate_preference_dataset", 10 | "create_prompts", 11 | "push_to_huggingface", 12 | "query_feature_store", 13 | ] 14 | -------------------------------------------------------------------------------- /steps/generate_datasets/create_prompts.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Annotated 2 | from zenml import get_step_context, step 3 | 4 | from llm_engineering.application.dataset import generation 5 | from llm_engineering.domain.dataset import DatasetType 6 | from llm_engineering.domain.prompt import GenerateDatasetSamplesPrompt 7 | from llm_engineering.domain.types import DataCategory 8 | 9 | 10 | @step 11 | def create_prompts( 12 | documents: Annotated[list, "queried_cleaned_documents"], 13 | dataset_type: Annotated[DatasetType, "dataset_type"], 14 | ) -> Annotated[dict[DataCategory, list[GenerateDatasetSamplesPrompt]], "prompts"]: 15 | dataset_generator = generation.get_dataset_generator(dataset_type) 16 | grouped_prompts = dataset_generator.get_prompts(documents) 17 | 18 | step_context = get_step_context() 19 | step_context.add_output_metadata(output_name="prompts", metadata=_get_metadata(grouped_prompts)) 20 | 21 | return grouped_prompts 22 | 23 | 24 | def _get_metadata(grouped_prompts: dict[DataCategory, list[GenerateDatasetSamplesPrompt]]) -> dict: 25 | prompt_categories = list(grouped_prompts.keys()) 26 | prompt_num_samples = {category: len(prompts) for category, prompts in grouped_prompts.items()} 27 | 28 | return {"data_categories": prompt_categories, "data_categories_num_prompts": prompt_num_samples} 29 | -------------------------------------------------------------------------------- /steps/generate_datasets/generate_intruction_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from typing_extensions import Annotated 4 | from zenml import ArtifactConfig, get_step_context, step 5 | 6 | from llm_engineering.application.dataset import generation 7 | from llm_engineering.domain.dataset import DatasetType, InstructTrainTestSplit 8 | from llm_engineering.domain.prompt import GenerateDatasetSamplesPrompt 9 | from llm_engineering.domain.types import DataCategory 10 | 11 | 12 | @step 13 | def generate_intruction_dataset( 14 | prompts: Annotated[dict[DataCategory, list[GenerateDatasetSamplesPrompt]], "prompts"], 15 | test_split_size: Annotated[float, "test_split_size"], 16 | mock: Annotated[bool, "mock_generation"] = False, 17 | ) -> Annotated[ 18 | InstructTrainTestSplit, 19 | ArtifactConfig( 20 | name="instruct_datasets", 21 | tags=["dataset", "instruct", "cleaned"], 22 | ), 23 | ]: 24 | dataset_generator = 
generation.get_dataset_generator(DatasetType.INSTRUCTION) 25 | datasets = dataset_generator.generate(prompts, test_size=test_split_size, mock=mock) 26 | 27 | step_context = get_step_context() 28 | step_context.add_output_metadata(output_name="instruct_datasets", metadata=_get_metadata_instruct_dataset(datasets)) 29 | 30 | return datasets 31 | 32 | 33 | def _get_metadata_instruct_dataset(datasets: InstructTrainTestSplit) -> dict[str, Any]: 34 | instruct_dataset_categories = list(datasets.train.keys()) 35 | train_num_samples = { 36 | category: instruct_dataset.num_samples for category, instruct_dataset in datasets.train.items() 37 | } 38 | test_num_samples = {category: instruct_dataset.num_samples for category, instruct_dataset in datasets.test.items()} 39 | 40 | return { 41 | "data_categories": instruct_dataset_categories, 42 | "test_split_size": datasets.test_split_size, 43 | "train_num_samples_per_category": train_num_samples, 44 | "test_num_samples_per_category": test_num_samples, 45 | } 46 | -------------------------------------------------------------------------------- /steps/generate_datasets/generate_preference_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from typing_extensions import Annotated 4 | from zenml import ArtifactConfig, get_step_context, step 5 | 6 | from llm_engineering.application.dataset import generation 7 | from llm_engineering.domain.dataset import DatasetType, PreferenceTrainTestSplit 8 | from llm_engineering.domain.prompt import GenerateDatasetSamplesPrompt 9 | from llm_engineering.domain.types import DataCategory 10 | 11 | 12 | @step 13 | def generate_preference_dataset( 14 | prompts: Annotated[dict[DataCategory, list[GenerateDatasetSamplesPrompt]], "prompts"], 15 | test_split_size: Annotated[float, "test_split_size"], 16 | mock: Annotated[bool, "mock_generation"] = False, 17 | ) -> Annotated[ 18 | PreferenceTrainTestSplit, 19 | ArtifactConfig( 20 | name="preference_datasets", 21 | tags=["dataset", "preference", "cleaned"], 22 | ), 23 | ]: 24 | dataset_generator = generation.get_dataset_generator(DatasetType.PREFERENCE) 25 | datasets = dataset_generator.generate(prompts, test_size=test_split_size, mock=mock) 26 | 27 | step_context = get_step_context() 28 | step_context.add_output_metadata( 29 | output_name="preference_datasets", metadata=_get_metadata_preference_dataset(datasets) 30 | ) 31 | 32 | return datasets 33 | 34 | 35 | def _get_metadata_preference_dataset(datasets: PreferenceTrainTestSplit) -> dict[str, Any]: 36 | instruct_dataset_categories = list(datasets.train.keys()) 37 | train_num_samples = { 38 | category: instruct_dataset.num_samples for category, instruct_dataset in datasets.train.items() 39 | } 40 | test_num_samples = {category: instruct_dataset.num_samples for category, instruct_dataset in datasets.test.items()} 41 | 42 | return { 43 | "data_categories": instruct_dataset_categories, 44 | "test_split_size": datasets.test_split_size, 45 | "train_num_samples_per_category": train_num_samples, 46 | "test_num_samples_per_category": test_num_samples, 47 | } 48 | -------------------------------------------------------------------------------- /steps/generate_datasets/push_to_huggingface.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from typing_extensions import Annotated 3 | from zenml import step 4 | 5 | from llm_engineering.domain.dataset import InstructTrainTestSplit, PreferenceTrainTestSplit 6 | 
from llm_engineering.settings import settings 7 | 8 | 9 | @step 10 | def push_to_huggingface( 11 | dataset: Annotated[InstructTrainTestSplit | PreferenceTrainTestSplit, "dataset_split"], 12 | dataset_id: Annotated[str, "dataset_id"], 13 | ) -> None: 14 | assert dataset_id is not None, "Dataset id must be provided for pushing to Huggingface" 15 | assert ( 16 | settings.HUGGINGFACE_ACCESS_TOKEN is not None 17 | ), "Huggingface access token must be provided for pushing to Huggingface" 18 | 19 | logger.info(f"Pushing dataset {dataset_id} to Hugging Face.") 20 | 21 | huggingface_dataset = dataset.to_huggingface(flatten=True) 22 | huggingface_dataset.push_to_hub(dataset_id, token=settings.HUGGINGFACE_ACCESS_TOKEN) 23 | -------------------------------------------------------------------------------- /steps/generate_datasets/query_feature_store.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | 3 | from loguru import logger 4 | from qdrant_client.http import exceptions 5 | from typing_extensions import Annotated 6 | from zenml import step 7 | 8 | from llm_engineering.domain.base.nosql import NoSQLBaseDocument 9 | from llm_engineering.domain.cleaned_documents import ( 10 | CleanedArticleDocument, 11 | CleanedDocument, 12 | CleanedPostDocument, 13 | CleanedRepositoryDocument, 14 | ) 15 | 16 | 17 | @step 18 | def query_feature_store() -> Annotated[list, "queried_cleaned_documents"]: 19 | logger.info("Querying feature store.") 20 | 21 | results = fetch_all_data() 22 | 23 | cleaned_documents = [doc for query_result in results.values() for doc in query_result] 24 | 25 | return cleaned_documents 26 | 27 | 28 | def fetch_all_data() -> dict[str, list[NoSQLBaseDocument]]: 29 | with ThreadPoolExecutor() as executor: 30 | future_to_query = { 31 | executor.submit( 32 | __fetch_articles, 33 | ): "articles", 34 | executor.submit( 35 | __fetch_posts, 36 | ): "posts", 37 | executor.submit( 38 | __fetch_repositories, 39 | ): "repositories", 40 | } 41 | 42 | results = {} 43 | for future in as_completed(future_to_query): 44 | query_name = future_to_query[future] 45 | try: 46 | results[query_name] = future.result() 47 | except Exception: 48 | logger.exception(f"'{query_name}' request failed.") 49 | 50 | results[query_name] = [] 51 | 52 | return results 53 | 54 | 55 | def __fetch_articles() -> list[CleanedDocument]: 56 | return __fetch(CleanedArticleDocument) 57 | 58 | 59 | def __fetch_posts() -> list[CleanedDocument]: 60 | return __fetch(CleanedPostDocument) 61 | 62 | 63 | def __fetch_repositories() -> list[CleanedDocument]: 64 | return __fetch(CleanedRepositoryDocument) 65 | 66 | 67 | def __fetch(cleaned_document_type: type[CleanedDocument], limit: int = 1) -> list[CleanedDocument]: 68 | try: 69 | cleaned_documents, next_offset = cleaned_document_type.bulk_find(limit=limit) 70 | except exceptions.UnexpectedResponse: 71 | return [] 72 | 73 | while next_offset: 74 | documents, next_offset = cleaned_document_type.bulk_find(limit=limit, offset=next_offset) 75 | cleaned_documents.extend(documents) 76 | 77 | return cleaned_documents 78 | -------------------------------------------------------------------------------- /steps/training/__init__.py: -------------------------------------------------------------------------------- 1 | from .train import train 2 | 3 | __all__ = ["train"] 4 | -------------------------------------------------------------------------------- /steps/training/train.py: 
-------------------------------------------------------------------------------- 1 | from zenml import step 2 | 3 | from llm_engineering.model.finetuning.sagemaker import run_finetuning_on_sagemaker 4 | 5 | 6 | @step 7 | def train( 8 | finetuning_type: str, 9 | num_train_epochs: int, 10 | per_device_train_batch_size: int, 11 | learning_rate: float, 12 | dataset_huggingface_workspace: str = "mlabonne", 13 | is_dummy: bool = False, 14 | ) -> None: 15 | run_finetuning_on_sagemaker( 16 | finetuning_type=finetuning_type, 17 | num_train_epochs=num_train_epochs, 18 | per_device_train_batch_size=per_device_train_batch_size, 19 | learning_rate=learning_rate, 20 | dataset_huggingface_workspace=dataset_huggingface_workspace, 21 | is_dummy=is_dummy, 22 | ) 23 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/tests/__init__.py -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/tests/integration/__init__.py -------------------------------------------------------------------------------- /tests/integration/integration_example_test.py: -------------------------------------------------------------------------------- 1 | def test_integration_example() -> None: 2 | string = "integration_test_example" 3 | 4 | assert string == "integration_test_example" 5 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/unit_example_test.py: -------------------------------------------------------------------------------- 1 | def test_unit_example() -> None: 2 | string = "unit_test_example" 3 | 4 | assert string == "unit_test_example" 5 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/tools/__init__.py -------------------------------------------------------------------------------- /tools/data_warehouse.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import click 5 | from loguru import logger 6 | 7 | from llm_engineering.domain.base.nosql import NoSQLBaseDocument 8 | from llm_engineering.domain.documents import ArticleDocument, PostDocument, RepositoryDocument, UserDocument 9 | 10 | 11 | @click.command() 12 | @click.option( 13 | "--export-raw-data", 14 | is_flag=True, 15 | default=False, 16 | help="Whether to export your data warehouse to a JSON file.", 17 | ) 18 | @click.option( 19 | "--import-raw-data", 20 | is_flag=True, 21 | default=False, 22 | help="Whether to import a JSON file into your data warehouse.", 23 | ) 24 | @click.option( 
25 | "--data-dir", 26 | default=Path("data/data_warehouse_raw_data"), 27 | type=Path, 28 | help="Path to the directory containing data warehouse raw data JSON files.", 29 | ) 30 | def main( 31 | export_raw_data, 32 | import_raw_data, 33 | data_dir: Path, 34 | ) -> None: 35 | assert export_raw_data or import_raw_data, "Specify at least one operation." 36 | 37 | if export_raw_data: 38 | __export(data_dir) 39 | 40 | if import_raw_data: 41 | __import(data_dir) 42 | 43 | 44 | def __export(data_dir: Path) -> None: 45 | logger.info(f"Exporting data warehouse to {data_dir}...") 46 | data_dir.mkdir(parents=True, exist_ok=True) 47 | 48 | __export_data_category(data_dir, ArticleDocument) 49 | __export_data_category(data_dir, PostDocument) 50 | __export_data_category(data_dir, RepositoryDocument) 51 | __export_data_category(data_dir, UserDocument) 52 | 53 | 54 | def __export_data_category(data_dir: Path, category_class: type[NoSQLBaseDocument]) -> None: 55 | data = category_class.bulk_find() 56 | serialized_data = [d.to_mongo() for d in data] 57 | export_file = data_dir / f"{category_class.__name__}.json" 58 | 59 | logger.info(f"Exporting {len(serialized_data)} items of {category_class.__name__} to {export_file}...") 60 | with export_file.open("w") as f: 61 | json.dump(serialized_data, f) 62 | 63 | 64 | def __import(data_dir: Path) -> None: 65 | logger.info(f"Importing data warehouse from {data_dir}...") 66 | assert data_dir.is_dir(), f"{data_dir} is not a directory or it doesn't exists." 67 | 68 | data_category_classes = { 69 | "ArticleDocument": ArticleDocument, 70 | "PostDocument": PostDocument, 71 | "RepositoryDocument": RepositoryDocument, 72 | "UserDocument": UserDocument, 73 | } 74 | 75 | for file in data_dir.iterdir(): 76 | if not file.is_file(): 77 | continue 78 | 79 | category_class_name = file.stem 80 | category_class = data_category_classes.get(category_class_name) 81 | if not category_class: 82 | logger.warning(f"Skipping {file} as it does not match any data category.") 83 | continue 84 | 85 | __import_data_category(file, category_class) 86 | 87 | 88 | def __import_data_category(file: Path, category_class: type[NoSQLBaseDocument]) -> None: 89 | with file.open("r") as f: 90 | data = json.load(f) 91 | 92 | logger.info(f"Importing {len(data)} items of {category_class.__name__} from {file}...") 93 | if len(data) > 0: 94 | deserialized_data = [category_class.from_mongo(d) for d in data] 95 | category_class.bulk_insert(deserialized_data) 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /tools/ml_service.py: -------------------------------------------------------------------------------- 1 | from llm_engineering.infrastructure.inference_pipeline_api import app # noqa 2 | 3 | if __name__ == "__main__": 4 | import uvicorn 5 | 6 | uvicorn.run("tools.ml_service:app", host="0.0.0.0", port=8000, reload=True) 7 | -------------------------------------------------------------------------------- /tools/rag.py: -------------------------------------------------------------------------------- 1 | from langchain.globals import set_verbose 2 | from loguru import logger 3 | 4 | from llm_engineering.application.rag.retriever import ContextRetriever 5 | from llm_engineering.infrastructure.opik_utils import configure_opik 6 | 7 | if __name__ == "__main__": 8 | configure_opik() 9 | set_verbose(True) 10 | 11 | query = """ 12 | My name is Paul Iusztin. 13 | 14 | Could you draft a LinkedIn post discussing RAG systems? 
15 | I'm particularly interested in: 16 | - how RAG works 17 | - how it is integrated with vector DBs and large language models (LLMs). 18 | """ 19 | 20 | retriever = ContextRetriever(mock=False) 21 | documents = retriever.search(query, k=9) 22 | 23 | logger.info("Retrieved documents:") 24 | for rank, document in enumerate(documents): 25 | logger.info(f"{rank + 1}: {document}") 26 | -------------------------------------------------------------------------------- /tools/run.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime as dt 2 | from pathlib import Path 3 | 4 | import click 5 | from loguru import logger 6 | 7 | from llm_engineering import settings 8 | from pipelines import ( 9 | digital_data_etl, 10 | end_to_end_data, 11 | evaluating, 12 | export_artifact_to_json, 13 | feature_engineering, 14 | generate_datasets, 15 | training, 16 | ) 17 | 18 | 19 | @click.command( 20 | help=""" 21 | LLM Engineering project CLI v0.0.1. 22 | 23 | Main entry point for the pipeline execution. 24 | This entrypoint is where everything comes together. 25 | 26 | Run the ZenML LLM Engineering project pipelines with various options. 27 | 28 | Run a pipeline with the required parameters. This executes 29 | all steps in the pipeline in the correct order using the orchestrator 30 | stack component that is configured in your active ZenML stack. 31 | 32 | Examples: 33 | 34 | \b 35 | # Show the available options 36 | python run.py --help 37 | 38 | \b 39 | # Run the pipeline without cache 40 | python run.py --no-cache 41 | 42 | \b 43 | # Run only the ETL pipeline 44 | python run.py --run-etl 45 | 46 | """ 47 | ) 48 | @click.option( 49 | "--no-cache", 50 | is_flag=True, 51 | default=False, 52 | help="Disable caching for the pipeline run.", 53 | ) 54 | @click.option( 55 | "--run-end-to-end-data", 56 | is_flag=True, 57 | default=False, 58 | help="Whether to run all the data pipelines in one go.", 59 | ) 60 | @click.option( 61 | "--run-etl", 62 | is_flag=True, 63 | default=False, 64 | help="Whether to run the ETL pipeline.", 65 | ) 66 | @click.option( 67 | "--run-export-artifact-to-json", 68 | is_flag=True, 69 | default=False, 70 | help="Whether to run the Artifact -> JSON pipeline.", 71 | ) 72 | @click.option( 73 | "--etl-config-filename", 74 | default="digital_data_etl_paul_iusztin.yaml", 75 | help="Filename of the ETL config file.", 76 | ) 77 | @click.option( 78 | "--run-feature-engineering", 79 | is_flag=True, 80 | default=False, 81 | help="Whether to run the FE pipeline.", 82 | ) 83 | @click.option( 84 | "--run-generate-instruct-datasets", 85 | is_flag=True, 86 | default=False, 87 | help="Whether to run the instruct dataset generation pipeline.", 88 | ) 89 | @click.option( 90 | "--run-generate-preference-datasets", 91 | is_flag=True, 92 | default=False, 93 | help="Whether to run the preference dataset generation pipeline.", 94 | ) 95 | @click.option( 96 | "--run-training", 97 | is_flag=True, 98 | default=False, 99 | help="Whether to run the training pipeline.", 100 | ) 101 | @click.option( 102 | "--run-evaluation", 103 | is_flag=True, 104 | default=False, 105 | help="Whether to run the evaluation pipeline.", 106 | ) 107 | @click.option( 108 | "--export-settings", 109 | is_flag=True, 110 | default=False, 111 | help="Whether to export your settings to ZenML or not.", 112 | ) 113 | def main( 114 | no_cache: bool = False, 115 | run_end_to_end_data: bool = False, 116 | run_etl: bool = False, 117 | etl_config_filename: str =
"digital_data_etl_paul_iusztin.yaml", 118 | run_export_artifact_to_json: bool = False, 119 | run_feature_engineering: bool = False, 120 | run_generate_instruct_datasets: bool = False, 121 | run_generate_preference_datasets: bool = False, 122 | run_training: bool = False, 123 | run_evaluation: bool = False, 124 | export_settings: bool = False, 125 | ) -> None: 126 | assert ( 127 | run_end_to_end_data 128 | or run_etl 129 | or run_export_artifact_to_json 130 | or run_feature_engineering 131 | or run_generate_instruct_datasets 132 | or run_generate_preference_datasets 133 | or run_training 134 | or run_evaluation 135 | or export_settings 136 | ), "Please specify an action to run." 137 | 138 | if export_settings: 139 | logger.info("Exporting settings to ZenML secrets.") 140 | settings.export() 141 | 142 | pipeline_args = { 143 | "enable_cache": not no_cache, 144 | } 145 | root_dir = Path(__file__).resolve().parent.parent 146 | 147 | if run_end_to_end_data: 148 | run_args_end_to_end = {} 149 | pipeline_args["config_path"] = root_dir / "configs" / "end_to_end_data.yaml" 150 | assert pipeline_args["config_path"].exists(), f"Config file not found: {pipeline_args['config_path']}" 151 | pipeline_args["run_name"] = f"end_to_end_data_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 152 | end_to_end_data.with_options(**pipeline_args)(**run_args_end_to_end) 153 | 154 | if run_etl: 155 | run_args_etl = {} 156 | pipeline_args["config_path"] = root_dir / "configs" / etl_config_filename 157 | assert pipeline_args["config_path"].exists(), f"Config file not found: {pipeline_args['config_path']}" 158 | pipeline_args["run_name"] = f"digital_data_etl_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 159 | digital_data_etl.with_options(**pipeline_args)(**run_args_etl) 160 | 161 | if run_export_artifact_to_json: 162 | run_args_etl = {} 163 | pipeline_args["config_path"] = root_dir / "configs" / "export_artifact_to_json.yaml" 164 | assert pipeline_args["config_path"].exists(), f"Config file not found: {pipeline_args['config_path']}" 165 | pipeline_args["run_name"] = f"export_artifact_to_json_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 166 | export_artifact_to_json.with_options(**pipeline_args)(**run_args_etl) 167 | 168 | if run_feature_engineering: 169 | run_args_fe = {} 170 | pipeline_args["config_path"] = root_dir / "configs" / "feature_engineering.yaml" 171 | pipeline_args["run_name"] = f"feature_engineering_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 172 | feature_engineering.with_options(**pipeline_args)(**run_args_fe) 173 | 174 | if run_generate_instruct_datasets: 175 | run_args_cd = {} 176 | pipeline_args["config_path"] = root_dir / "configs" / "generate_instruct_datasets.yaml" 177 | pipeline_args["run_name"] = f"generate_instruct_datasets_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 178 | generate_datasets.with_options(**pipeline_args)(**run_args_cd) 179 | 180 | if run_generate_preference_datasets: 181 | run_args_cd = {} 182 | pipeline_args["config_path"] = root_dir / "configs" / "generate_preference_datasets.yaml" 183 | pipeline_args["run_name"] = f"generate_preference_datasets_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 184 | generate_datasets.with_options(**pipeline_args)(**run_args_cd) 185 | 186 | if run_training: 187 | run_args_cd = {} 188 | pipeline_args["config_path"] = root_dir / "configs" / "training.yaml" 189 | pipeline_args["run_name"] = f"training_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 190 | training.with_options(**pipeline_args)(**run_args_cd) 191 | 192 | if run_evaluation: 193 | 
run_args_cd = {} 194 | pipeline_args["config_path"] = root_dir / "configs" / "evaluating.yaml" 195 | pipeline_args["run_name"] = f"evaluation_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 196 | evaluating.with_options(**pipeline_args)(**run_args_cd) 197 | 198 | 199 | if __name__ == "__main__": 200 | main() 201 | --------------------------------------------------------------------------------
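Closing usage sketch: the pipelines above can also be triggered programmatically, mirroring what tools/run.py does internally rather than adding anything new. This assumes the local infrastructure is already running and the YAML config exists in configs/:

# Minimal sketch: run the digital data ETL the same way tools/run.py wires it up.
# Assumes a running local ZenML stack and the config file shipped in configs/.
from datetime import datetime as dt
from pathlib import Path

from pipelines import digital_data_etl

config_path = Path("configs") / "digital_data_etl_paul_iusztin.yaml"
assert config_path.exists(), f"Config file not found: {config_path}"

digital_data_etl.with_options(
    enable_cache=False,
    config_path=config_path,
    run_name=f"digital_data_etl_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}",
)()  # Pipeline parameters (user_full_name, links) come from the YAML config.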