├── .env.example ├── .github └── workflows │ ├── cd.yaml │ └── ci.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── .vscode └── settings.json ├── Dockerfile ├── LICENSE ├── README.md ├── code_snippets ├── 03_custom_odm_example.py ├── 03_orm.py ├── 08_instructor_embeddings.py ├── 08_text_embeddings.py └── 08_text_image_embeddings.py ├── configs ├── digital_data_etl_maxime_labonne.yaml ├── digital_data_etl_paul_iusztin.yaml ├── end_to_end_data.yaml ├── evaluating.yaml ├── export_artifact_to_json.yaml ├── feature_engineering.yaml ├── generate_instruct_datasets.yaml ├── generate_preference_datasets.yaml └── training.yaml ├── data ├── artifacts │ ├── cleaned_documents.json │ ├── instruct_datasets.json │ ├── preference_datasets.json │ └── raw_documents.json └── data_warehouse_raw_data │ ├── ArticleDocument.json │ ├── PostDocument.json │ ├── RepositoryDocument.json │ └── UserDocument.json ├── docker-compose.yml ├── images ├── cover_plus.png └── crazy_cat.jpg ├── llm_engineering ├── __init__.py ├── application │ ├── __init__.py │ ├── crawlers │ │ ├── __init__.py │ │ ├── base.py │ │ ├── custom_article.py │ │ ├── dispatcher.py │ │ ├── github.py │ │ ├── linkedin.py │ │ └── medium.py │ ├── dataset │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── generation.py │ │ ├── output_parsers.py │ │ └── utils.py │ ├── networks │ │ ├── __init__.py │ │ ├── base.py │ │ └── embeddings.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── chunking_data_handlers.py │ │ ├── cleaning_data_handlers.py │ │ ├── dispatchers.py │ │ ├── embedding_data_handlers.py │ │ └── operations │ │ │ ├── __init__.py │ │ │ ├── chunking.py │ │ │ └── cleaning.py │ ├── rag │ │ ├── __init__.py │ │ ├── base.py │ │ ├── prompt_templates.py │ │ ├── query_expanison.py │ │ ├── reranking.py │ │ ├── retriever.py │ │ └── self_query.py │ └── utils │ │ ├── __init__.py │ │ ├── misc.py │ │ └── split_user_full_name.py ├── domain │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── nosql.py │ │ └── vector.py │ ├── chunks.py │ ├── cleaned_documents.py │ ├── dataset.py │ ├── documents.py │ ├── embedded_chunks.py │ ├── exceptions.py │ ├── inference.py │ ├── prompt.py │ ├── queries.py │ └── types.py ├── infrastructure │ ├── __init__.py │ ├── aws │ │ ├── __init__.py │ │ ├── deploy │ │ │ ├── __init__.py │ │ │ ├── autoscaling_sagemaker_endpoint.py │ │ │ ├── delete_sagemaker_endpoint.py │ │ │ └── huggingface │ │ │ │ ├── __init__.py │ │ │ │ ├── config.py │ │ │ │ ├── run.py │ │ │ │ └── sagemaker_huggingface.py │ │ └── roles │ │ │ ├── create_execution_role.py │ │ │ └── create_sagemaker_role.py │ ├── db │ │ ├── mongo.py │ │ └── qdrant.py │ ├── files_io.py │ ├── inference_pipeline_api.py │ └── opik_utils.py ├── model │ ├── Readme.md │ ├── __init__.py │ ├── evaluation │ │ ├── __init__.py │ │ ├── evaluate.py │ │ ├── requirements.txt │ │ └── sagemaker.py │ ├── finetuning │ │ ├── __init__.py │ │ ├── finetune.py │ │ ├── requirements.txt │ │ └── sagemaker.py │ ├── inference │ │ ├── __init__.py │ │ ├── inference.py │ │ ├── run.py │ │ └── test.py │ └── utils.py └── settings.py ├── pipelines ├── __init__.py ├── digital_data_etl.py ├── end_to_end_data.py ├── evaluating.py ├── export_artifact_to_json.py ├── feature_engineering.py ├── generate_datasets.py └── training.py ├── poetry.lock ├── pyproject.toml ├── ruff.toml ├── steps ├── __init__.py ├── etl │ ├── __init__.py │ ├── crawl_links.py │ └── get_or_create_user.py ├── evaluating │ ├── __init__.py │ └── evaluate.py ├── export │ ├── __init__.py │ ├── serialize_artifact.py │ └── to_json.py ├── 
feature_engineering │ ├── __init__.py │ ├── clean.py │ ├── load_to_vector_db.py │ ├── query_data_warehouse.py │ └── rag.py ├── generate_datasets │ ├── __init__.py │ ├── create_prompts.py │ ├── generate_intruction_dataset.py │ ├── generate_preference_dataset.py │ ├── push_to_huggingface.py │ └── query_feature_store.py └── training │ ├── __init__.py │ └── train.py ├── tests ├── __init__.py ├── integration │ ├── __init__.py │ └── integration_example_test.py └── unit │ ├── __init__.py │ └── unit_example_test.py └── tools ├── __init__.py ├── data_warehouse.py ├── ml_service.py ├── rag.py └── run.py /.env.example: -------------------------------------------------------------------------------- 1 | # --- Required settings even when working locally. --- 2 | 3 | # OpenAI API Config 4 | OPENAI_MODEL_ID=gpt-4o-mini 5 | OPENAI_API_KEY=str 6 | 7 | # Huggingface API Config 8 | HUGGINGFACE_ACCESS_TOKEN=str 9 | 10 | # Comet ML (during training and inference) 11 | COMET_API_KEY=str 12 | 13 | # --- Required settings when deploying the code. --- 14 | # --- Otherwise, default values work fine. --- 15 | 16 | # MongoDB database 17 | DATABASE_HOST="mongodb://llm_engineering:llm_engineering@127.0.0.1:27017" 18 | 19 | # Qdrant vector database 20 | USE_QDRANT_CLOUD=false 21 | QDRANT_CLOUD_URL=str 22 | QDRANT_APIKEY=str 23 | 24 | # AWS Authentication 25 | AWS_ARN_ROLE=str 26 | AWS_REGION=eu-central-1 27 | AWS_ACCESS_KEY=str 28 | AWS_SECRET_KEY=str 29 | -------------------------------------------------------------------------------- /.github/workflows/cd.yaml: -------------------------------------------------------------------------------- 1 | name: CD 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | build: 14 | name: Build & Push Docker Image 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout Code 18 | uses: actions/checkout@v3 19 | 20 | - name: Set up Docker Buildx 21 | uses: docker/setup-buildx-action@v3 22 | 23 | - name: Configure AWS credentials 24 | uses: aws-actions/configure-aws-credentials@v1 25 | with: 26 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 27 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 28 | aws-region: ${{ secrets.AWS_REGION }} 29 | 30 | - name: Login to Amazon ECR 31 | id: login-ecr 32 | uses: aws-actions/amazon-ecr-login@v1 33 | 34 | - name: Build images & push to ECR 35 | id: build-image 36 | uses: docker/build-push-action@v6 37 | with: 38 | context: . 
39 | file: ./Dockerfile 40 | tags: | 41 | ${{ steps.login-ecr.outputs.registry }}/${{ secrets.AWS_ECR_NAME }}:${{ github.sha }} 42 | ${{ steps.login-ecr.outputs.registry }}/${{ secrets.AWS_ECR_NAME }}:latest 43 | push: true 44 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.ref }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | qa: 12 | name: QA 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v3 18 | 19 | - name: Setup Python 20 | uses: actions/setup-python@v3 21 | with: 22 | python-version: "3.11" 23 | 24 | - name: Install poetry 25 | uses: abatilo/actions-poetry@v2 26 | with: 27 | poetry-version: 1.8.3 28 | 29 | - name: Install packages 30 | run: | 31 | poetry install --only dev 32 | poetry self add 'poethepoet[poetry_plugin]' 33 | 34 | - name: gitleaks check 35 | run: poetry poe gitleaks-check 36 | 37 | - name: Lint check [Python] 38 | run: poetry poe lint-check 39 | 40 | - name: Format check [Python] 41 | run: poetry poe format-check 42 | 43 | test: 44 | name: Test 45 | runs-on: ubuntu-latest 46 | 47 | steps: 48 | - name: Checkout 49 | uses: actions/checkout@v3 50 | 51 | - name: Setup Python 52 | uses: actions/setup-python@v3 53 | with: 54 | python-version: "3.11" 55 | 56 | - name: Install poetry 57 | uses: abatilo/actions-poetry@v2 58 | with: 59 | poetry-version: 1.8.3 60 | 61 | - name: Install packages 62 | run: | 63 | poetry install 64 | poetry self add 'poethepoet[poetry_plugin]' 65 | 66 | - name: Run tests 67 | run: | 68 | echo "Running tests..." 69 | poetry poe test 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # IDEs 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | # MacOs 165 | .DS_Store 166 | 167 | # VS Code 168 | .vscode/**/launch.json 169 | 170 | # Data 171 | output/ 172 | sagemaker_*.json 173 | run_ids.txt 174 | 175 | # Virtual environments 176 | *_venv 177 | *_myenv 178 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.3.5 4 | hooks: 5 | - id: ruff # Run the linter. 6 | - id: ruff-format # Run the formatter. 7 | - repo: https://github.com/gitleaks/gitleaks 8 | rev: v8.18.2 9 | hooks: 10 | - id: gitleaks 11 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11.8 2 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.formatOnSave": true, 4 | "editor.codeActionsOnSave": { 5 | "source.fixAll": "explicit", 6 | "source.organizeImports": "explicit" 7 | }, 8 | "editor.defaultFormatter": "charliermarsh.ruff" 9 | }, 10 | "notebook.formatOnSave.enabled": true, 11 | "notebook.codeActionsOnSave": { 12 | "notebook.source.fixAll": "explicit", 13 | "notebook.source.organizeImports": "explicit" 14 | }, 15 | } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim-bullseye AS release 2 | 3 | ENV WORKSPACE_ROOT=/app/ 4 | ENV PYTHONDONTWRITEBYTECODE=1 5 | ENV PYTHONUNBUFFERED=1 6 | ENV POETRY_VERSION=1.8.3 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | ENV POETRY_NO_INTERACTION=1 9 | 10 | # Install Google Chrome 11 | RUN apt-get update -y && \ 12 | apt-get install -y gnupg wget curl --no-install-recommends && \ 13 | wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-linux-signing-key.gpg && \ 14 | echo "deb [signed-by=/usr/share/keyrings/google-linux-signing-key.gpg] https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \ 15 | apt-get update -y && \ 16 | apt-get install -y google-chrome-stable && \ 17 | rm -rf /var/lib/apt/lists/* 18 | 19 | # Install other system dependencies. 20 | RUN apt-get update -y \ 21 | && apt-get install -y --no-install-recommends build-essential \ 22 | gcc \ 23 | python3-dev \ 24 | build-essential \ 25 | libglib2.0-dev \ 26 | libnss3-dev \ 27 | && apt-get clean \ 28 | && rm -rf /var/lib/apt/lists/* 29 | 30 | # Install Poetry using pip and clear cache 31 | RUN pip install --no-cache-dir "poetry==$POETRY_VERSION" 32 | RUN poetry config installer.max-workers 20 33 | 34 | WORKDIR $WORKSPACE_ROOT 35 | 36 | # Copy the poetry lock file and pyproject.toml file to install dependencies 37 | COPY pyproject.toml poetry.lock $WORKSPACE_ROOT 38 | 39 | # Install the dependencies and clear cache 40 | RUN poetry config virtualenvs.create false && \ 41 | poetry install --no-root --no-interaction --no-cache --without dev && \ 42 | poetry self add 'poethepoet[poetry_plugin]' && \ 43 | rm -rf ~/.cache/pypoetry/cache/ && \ 44 | rm -rf ~/.cache/pypoetry/artifacts/ 45 | 46 | # Copy the rest of the code. 47 | COPY . 
$WORKSPACE_ROOT 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /code_snippets/03_custom_odm_example.py: -------------------------------------------------------------------------------- 1 | from llm_engineering.domain.documents import ArticleDocument, UserDocument 2 | 3 | if __name__ == "__main__": 4 | user = UserDocument.get_or_create(first_name="Paul", last_name="Iusztin") 5 | articles = ArticleDocument.bulk_find(author_id=str(user.id)) 6 | 7 | print(f"User ID: {user.id}") # noqa 8 | print(f"User name: {user.first_name} {user.last_name}") # noqa 9 | print(f"Number of articles: {len(articles)}") # noqa 10 | print("First article link:", articles[0].link) # noqa 11 | -------------------------------------------------------------------------------- /code_snippets/03_orm.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, create_engine 2 | from sqlalchemy.orm import declarative_base, sessionmaker 3 | 4 | # Create virtual environment, install dependencies and run the code: 5 | # 1. Create: python3 -m venv orm_venv 6 | # 2. Activate: source orm_venv/bin/activate 7 | # 3. Install: pip install sqlalchemy==2.0.35 8 | # 4. Run the code: python code_snippets/03_orm.py 9 | 10 | if __name__ == "__main__": 11 | Base = declarative_base() 12 | 13 | # Define a class that maps to the users table. 14 | class User(Base): 15 | __tablename__ = "users" 16 | 17 | id = Column(Integer, primary_key=True) 18 | name = Column(String) 19 | 20 | # Create an SQLite database in memory. 21 | engine = create_engine("sqlite:///:memory:") 22 | Base.metadata.create_all(engine) 23 | 24 | # Create a session used to interact with the database. 25 | Session = sessionmaker(bind=engine) 26 | session = Session() 27 | 28 | # Add a new user. 29 | new_user = User(name="Alice") 30 | session.add(new_user) 31 | session.commit() 32 | 33 | # Query the database. 
34 | user = session.query(User).first() 35 | if user: 36 | print(f"User ID: {user.id}") # noqa 37 | print(f"User name: {user.name}") # noqa 38 | -------------------------------------------------------------------------------- /code_snippets/08_instructor_embeddings.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | 3 | # Create virtual environment, install dependencies and run the code: 4 | # 1. Create: python3 -m venv instructor_venv 5 | # 2. Activate: source instructor_venv/bin/activate 6 | # 3. Install: pip install sentence-transformers==3.3.0 7 | # 4. Run the code: python code_snippets/08_instructor_embeddings.py 8 | 9 | if __name__ == "__main__": 10 | model = SentenceTransformer("hkunlp/instructor-base") 11 | 12 | sentence = "RAG Fundamentals First" 13 | 14 | instruction = "Represent the title of an article about AI:" 15 | 16 | embeddings = model.encode([[instruction, sentence]]) 17 | print(embeddings.shape) # noqa 18 | # Output: (1, 768) 19 | -------------------------------------------------------------------------------- /code_snippets/08_text_embeddings.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | 3 | # Leverage the Poetry virtual environment to run the code: 4 | # poetry run python code_snippets/08_text_embeddings.py 5 | 6 | if __name__ == "__main__": 7 | # 1. Load a pretrained Sentence Transformer model. 8 | model = SentenceTransformer("all-MiniLM-L6-v2") 9 | 10 | # The sentences to encode. 11 | sentences = ["The dog sits outside waiting for a treat.", "I am going swimming.", "The dog is swimming."] 12 | 13 | # 2. Calculate embeddings. 14 | embeddings = model.encode(sentences) 15 | print(embeddings.shape) # noqa 16 | # Output: (3, 384) 17 | 18 | # 3. Calculate the embedding similarities using cosine similarity. 19 | similarities = model.similarity(embeddings, embeddings) 20 | print(similarities) # noqa 21 | # Output: 22 | # tensor([[ 1.0000, -0.0389, 0.2692], 23 | # [-0.0389, 1.0000, 0.3837], 24 | # [ 0.2692, 0.3837, 1.0000]]) 25 | # 26 | # similarities[0, 0] = The similarity between the first sentence and itself. 27 | # similarities[0, 1] = The similarity between the first and second sentence. 28 | # similarities[2, 1] = The similarity between the third and second sentence. 29 | -------------------------------------------------------------------------------- /code_snippets/08_text_image_embeddings.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | import requests 4 | from PIL import Image 5 | from sentence_transformers import SentenceTransformer 6 | 7 | # Leverage the Poetry virtual environment to run the code: 8 | # poetry run python code_snippets/08_text_image_embeddings.py 9 | 10 | if __name__ == "__main__": 11 | # Load an image with a crazy cat. 12 | response = requests.get( 13 | "https://github.com/PacktPublishing/LLM-Engineering/blob/main/images/crazy_cat.jpg?raw=true" 14 | ) 15 | image = Image.open(BytesIO(response.content)) 16 | 17 | # Load CLIP model. 18 | model = SentenceTransformer("clip-ViT-B-32") 19 | 20 | # Encode the loaded image. 21 | img_emb = model.encode(image) 22 | 23 | # Encode text descriptions. 
24 | text_emb = model.encode( 25 | [ 26 | "A crazy cat smiling.", 27 | "A white and brown cat with a yellow bandana.", 28 | "A man eating in the garden.", 29 | ] 30 | ) 31 | print(text_emb.shape) # noqa 32 | # Output: (3, 512) 33 | 34 | # Compute similarities. 35 | similarity_scores = model.similarity(img_emb, text_emb) 36 | print(similarity_scores) # noqa 37 | # Output: tensor([[0.3068, 0.3300, 0.1719]]) 38 | -------------------------------------------------------------------------------- /configs/digital_data_etl_maxime_labonne.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | user_full_name: Maxime Labonne # [First Name(s)] [Last Name] 10 | links: 11 | # Personal Blog 12 | - https://mlabonne.github.io/blog/posts/2024-07-29_Finetune_Llama31.html 13 | - https://mlabonne.github.io/blog/posts/2024-07-15_The_Rise_of_Agentic_Data_Generation.html 14 | # Substack 15 | - https://maximelabonne.substack.com/p/uncensor-any-llm-with-abliteration-d30148b7d43e 16 | - https://maximelabonne.substack.com/p/create-mixtures-of-experts-with-mergekit-11b318c99562 17 | - https://maximelabonne.substack.com/p/merge-large-language-models-with-mergekit-2118fb392b54 18 | - https://maximelabonne.substack.com/p/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac 19 | - https://maximelabonne.substack.com/p/exllamav2-the-fastest-library-to-run-llms-32aeda294d26 20 | - https://maximelabonne.substack.com/p/quantize-llama-models-with-ggml-and-llama-cpp-3612dfbcc172 21 | - https://maximelabonne.substack.com/p/a-beginners-guide-to-llm-fine-tuning-4bae7d4da672 22 | - https://maximelabonne.substack.com/p/graph-convolutional-networks-introduction-to-gnns-24b3f60d6c95 23 | - https://maximelabonne.substack.com/p/4-bit-quantization-with-gptq-36b0f4f02c34 24 | - https://maximelabonne.substack.com/p/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32 25 | - https://maximelabonne.substack.com/p/introduction-to-weight-quantization-2494701b9c0c 26 | - https://maximelabonne.substack.com/p/decoding-strategies-in-large-language-models-9733a8f70539 27 | - https://maximelabonne.substack.com/p/the-art-of-spending-optimizing-your-marketing-budget-with-nonlinear-optimization-6c8a39afb3c2 28 | - https://maximelabonne.substack.com/p/create-a-bot-to-find-diamonds-in-minecraft-d836606a993a 29 | - https://maximelabonne.substack.com/p/constraint-programming-67ac16fa0c81 30 | - https://maximelabonne.substack.com/p/how-to-design-the-most-powerful-graph-neural-network-3d18b07a6e66 31 | - https://maximelabonne.substack.com/p/introduction-to-graphsage-in-python-a9e7f9ecf9d7 32 | - https://maximelabonne.substack.com/p/graph-attention-networks-in-python-975736ac5c0c 33 | - https://maximelabonne.substack.com/p/integer-programming-vs-linear-programming-in-python-f1be5bb4e60e 34 | - https://maximelabonne.substack.com/p/introduction-to-linear-programming-in-python-9261e7eb44b 35 | - https://maximelabonne.substack.com/p/what-is-a-tensor-in-deep-learning-6dedd95d6507 36 | - https://maximelabonne.substack.com/p/efficiently-iterating-over-rows-in-a-pandas-dataframe-7dd5f9992c01 37 | - https://maximelabonne.substack.com/p/q-learning-for-beginners-2837b777741 38 | - https://maximelabonne.substack.com/p/how-to-start-machine-learning-for-developers-in-2022-390af12b193f 39 | 
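Editor's sketch: the digital data ETL configs (the Maxime Labonne file above and the Paul Iusztin file that follows) share one schema — a `settings` block consumed by ZenML and a `parameters` block with one author name plus the links to crawl. The snippet below is illustrative only, not a file from this repository; it assumes PyYAML and Pydantic are available and simply shows how that `parameters` block can be loaded and validated. In the actual project, the whole file is handed to the ZenML runner, which maps `parameters` onto the pipeline's function arguments.

from pathlib import Path

import yaml  # assumes PyYAML is installed
from pydantic import BaseModel


class ETLParameters(BaseModel):
    # Mirrors the `parameters` block of configs/digital_data_etl_*.yaml.
    user_full_name: str  # "[First Name(s)] [Last Name]"
    links: list[str]


def load_etl_parameters(config_path: str) -> ETLParameters:
    # Read the YAML file and validate only the `parameters` section.
    raw = yaml.safe_load(Path(config_path).read_text())
    return ETLParameters(**raw["parameters"])


if __name__ == "__main__":
    params = load_etl_parameters("configs/digital_data_etl_maxime_labonne.yaml")
    print(f"{params.user_full_name}: {len(params.links)} links to crawl")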
-------------------------------------------------------------------------------- /configs/digital_data_etl_paul_iusztin.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | user_full_name: Paul Iusztin # [First Name(s)] [Last Name] 10 | links: 11 | # Medium (only articles that are not under the paid wall work) 12 | - https://medium.com/decodingml/an-end-to-end-framework-for-production-ready-llm-systems-by-building-your-llm-twin-2cc6bb01141f 13 | - https://medium.com/decodingml/a-real-time-retrieval-system-for-rag-on-social-media-data-9cc01d50a2a0 14 | - https://medium.com/decodingml/sota-python-streaming-pipelines-for-fine-tuning-llms-and-rag-in-real-time-82eb07795b87 15 | - https://medium.com/decodingml/the-4-advanced-rag-algorithms-you-must-know-to-implement-5d0c7f1199d2 16 | - https://medium.com/decodingml/architect-scalable-and-cost-effective-llm-rag-inference-pipelines-73b94ef82a99 17 | # Substack 18 | - https://decodingml.substack.com/p/real-time-feature-pipelines-with?r=1ttoeh 19 | - https://decodingml.substack.com/p/building-ml-systems-the-right-way?r=1ttoeh 20 | - https://decodingml.substack.com/p/reduce-your-pytorchs-code-latency?r=1ttoeh 21 | - https://decodingml.substack.com/p/llm-agents-demystified?r=1ttoeh 22 | - https://decodingml.substack.com/p/scalable-rag-ingestion-pipeline-using?r=1ttoeh 23 | - https://decodingml.substack.com/p/the-ultimate-mlops-tool?r=1ttoeh 24 | - https://decodingml.substack.com/p/the-new-king-of-infrastructure-as?r=1ttoeh 25 | - https://decodingml.substack.com/p/highly-scalable-data-ingestion-architecture?r=1ttoeh 26 | - https://decodingml.substack.com/p/2-key-llmops-concepts?r=1ttoeh 27 | - https://decodingml.substack.com/p/the-llm-twin-free-course-on-production?r=1ttoeh 28 | - https://decodingml.substack.com/p/a-blueprint-for-designing-production?r=1ttoeh 29 | - https://decodingml.substack.com/p/the-difference-between-development?r=1ttoeh 30 | - https://decodingml.substack.com/p/architect-scalable-and-cost-effective?r=1ttoeh 31 | - https://decodingml.substack.com/p/7-tips-to-reduce-your-vram-when-training?r=1ttoeh 32 | - https://decodingml.substack.com/p/using-this-python-package-you-can?r=1ttoeh 33 | - https://decodingml.substack.com/p/the-4-advanced-rag-algorithms-you?r=1ttoeh 34 | - https://decodingml.substack.com/p/problems-deploying-your-ml-models?r=1ttoeh 35 | - https://decodingml.substack.com/p/sota-python-streaming-pipelines-for?r=1ttoeh 36 | - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh 37 | - https://decodingml.substack.com/p/my-ml-monthly-learning-resource-recommendations?r=1ttoeh 38 | - https://decodingml.substack.com/p/an-end-to-end-framework-for-production?r=1ttoeh 39 | - https://decodingml.substack.com/p/upskill-your-llm-knowledge-base-with?r=1ttoeh 40 | - https://decodingml.substack.com/p/want-to-learn-an-end-to-end-framework?r=1ttoeh 41 | - https://decodingml.substack.com/p/my-favorite-way-to-implement-a-configuration?r=1ttoeh 42 | - https://decodingml.substack.com/p/a-real-time-retrieval-system-for?r=1ttoeh 43 | - https://decodingml.substack.com/p/4-key-decoding-strategies-for-llms?r=1ttoeh 44 | - https://decodingml.substack.com/p/dml-new-year-the-new-and-improved?r=1ttoeh 45 | - https://decodingml.substack.com/p/dml-8-types-of-mlops-tools-that-must?r=1ttoeh 46 
| - https://decodingml.substack.com/p/dml-this-is-what-you-need-to-build?r=1ttoeh 47 | - https://decodingml.substack.com/p/dml-7-steps-on-how-to-fine-tune-an?r=1ttoeh 48 | - https://decodingml.substack.com/p/dml-how-do-you-generate-a-q-and-a?r=1ttoeh 49 | - https://decodingml.substack.com/p/dml-what-do-you-need-to-fine-tune?r=1ttoeh 50 | - https://decodingml.substack.com/p/dml-why-and-when-do-you-need-to-fine?r=1ttoeh 51 | - https://decodingml.substack.com/p/dml-how-to-implement-a-streaming?r=1ttoeh 52 | - https://decodingml.substack.com/p/dml-why-and-what-do-you-need-a-streaming?r=1ttoeh 53 | - https://decodingml.substack.com/p/dml-unwrapping-the-3-pipeline-design?r=1ttoeh 54 | - https://decodingml.substack.com/p/dml-how-to-design-an-llm-system-for?r=1ttoeh 55 | - https://decodingml.substack.com/p/dml-synced-vector-dbs-a-guide-to?r=1ttoeh 56 | - https://decodingml.substack.com/p/dml-what-is-the-difference-between?r=1ttoeh 57 | - https://decodingml.substack.com/p/dml-7-steps-to-build-a-production?r=1ttoeh 58 | - https://decodingml.substack.com/p/dml-chain-of-thought-reasoning-write?r=1ttoeh 59 | - https://decodingml.substack.com/p/dml-build-and-serve-a-production?r=1ttoeh 60 | - https://decodingml.substack.com/p/dml-4-key-ideas-you-must-know-to?r=1ttoeh 61 | - https://decodingml.substack.com/p/dml-how-to-add-real-time-monitoring?r=1ttoeh 62 | - https://decodingml.substack.com/p/dml-top-6-ml-platform-features-you?r=1ttoeh -------------------------------------------------------------------------------- /configs/end_to_end_data.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | # Data ETL & Feature engineering pipelines parameters 10 | author_links: 11 | - user_full_name: Paul Iusztin # [First Name(s)] [Last Name] 12 | links: 13 | # Medium (only articles that are not under the paid wall work) 14 | - https://medium.com/decodingml/an-end-to-end-framework-for-production-ready-llm-systems-by-building-your-llm-twin-2cc6bb01141f 15 | - https://medium.com/decodingml/a-real-time-retrieval-system-for-rag-on-social-media-data-9cc01d50a2a0 16 | - https://medium.com/decodingml/sota-python-streaming-pipelines-for-fine-tuning-llms-and-rag-in-real-time-82eb07795b87 17 | - https://medium.com/decodingml/the-4-advanced-rag-algorithms-you-must-know-to-implement-5d0c7f1199d2 18 | - https://medium.com/decodingml/architect-scalable-and-cost-effective-llm-rag-inference-pipelines-73b94ef82a99 19 | # Substack 20 | - https://decodingml.substack.com/p/a-blueprint-for-designing-production?r=1ttoeh 21 | - https://decodingml.substack.com/p/the-difference-between-development?r=1ttoeh 22 | - https://decodingml.substack.com/p/architect-scalable-and-cost-effective?r=1ttoeh 23 | - https://decodingml.substack.com/p/7-tips-to-reduce-your-vram-when-training?r=1ttoeh 24 | - https://decodingml.substack.com/p/using-this-python-package-you-can?r=1ttoeh 25 | - https://decodingml.substack.com/p/the-4-advanced-rag-algorithms-you?r=1ttoeh 26 | - https://decodingml.substack.com/p/problems-deploying-your-ml-models?r=1ttoeh 27 | - https://decodingml.substack.com/p/sota-python-streaming-pipelines-for?r=1ttoeh 28 | - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh 29 | - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh 30 | - 
https://decodingml.substack.com/p/my-ml-monthly-learning-resource-recommendations?r=1ttoeh 31 | - https://decodingml.substack.com/p/an-end-to-end-framework-for-production?r=1ttoeh 32 | - https://decodingml.substack.com/p/upskill-your-llm-knowledge-base-with?r=1ttoeh 33 | - https://decodingml.substack.com/p/want-to-learn-an-end-to-end-framework?r=1ttoeh 34 | - https://decodingml.substack.com/p/my-favorite-way-to-implement-a-configuration?r=1ttoeh 35 | - https://decodingml.substack.com/p/a-real-time-retrieval-system-for?r=1ttoeh 36 | - https://decodingml.substack.com/p/4-key-decoding-strategies-for-llms?r=1ttoeh 37 | - https://decodingml.substack.com/p/dml-new-year-the-new-and-improved?r=1ttoeh 38 | - https://decodingml.substack.com/p/dml-8-types-of-mlops-tools-that-must?r=1ttoeh 39 | - https://decodingml.substack.com/p/dml-this-is-what-you-need-to-build?r=1ttoeh 40 | - https://decodingml.substack.com/p/dml-7-steps-on-how-to-fine-tune-an?r=1ttoeh 41 | - https://decodingml.substack.com/p/dml-how-do-you-generate-a-q-and-a?r=1ttoeh 42 | - https://decodingml.substack.com/p/dml-what-do-you-need-to-fine-tune?r=1ttoeh 43 | - https://decodingml.substack.com/p/dml-why-and-when-do-you-need-to-fine?r=1ttoeh 44 | - https://decodingml.substack.com/p/dml-how-to-implement-a-streaming?r=1ttoeh 45 | - https://decodingml.substack.com/p/dml-why-and-what-do-you-need-a-streaming?r=1ttoeh 46 | - https://decodingml.substack.com/p/dml-unwrapping-the-3-pipeline-design?r=1ttoeh 47 | - https://decodingml.substack.com/p/dml-how-to-design-an-llm-system-for?r=1ttoeh 48 | - https://decodingml.substack.com/p/dml-synced-vector-dbs-a-guide-to?r=1ttoeh 49 | - https://decodingml.substack.com/p/dml-what-is-the-difference-between?r=1ttoeh 50 | - https://decodingml.substack.com/p/dml-7-steps-to-build-a-production?r=1ttoeh 51 | - https://decodingml.substack.com/p/dml-chain-of-thought-reasoning-write?r=1ttoeh 52 | - https://decodingml.substack.com/p/dml-build-and-serve-a-production?r=1ttoeh 53 | - https://decodingml.substack.com/p/dml-4-key-ideas-you-must-know-to?r=1ttoeh 54 | - https://decodingml.substack.com/p/dml-how-to-add-real-time-monitoring?r=1ttoeh 55 | - https://decodingml.substack.com/p/dml-top-6-ml-platform-features-you?r=1ttoeh 56 | - user_full_name: Maxime Labonne # [First Name(s)] [Last Name] 57 | links: 58 | # Substack 59 | - https://maximelabonne.substack.com/p/uncensor-any-llm-with-abliteration-d30148b7d43e 60 | - https://maximelabonne.substack.com/p/create-mixtures-of-experts-with-mergekit-11b318c99562 61 | - https://maximelabonne.substack.com/p/merge-large-language-models-with-mergekit-2118fb392b54 62 | - https://maximelabonne.substack.com/p/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac 63 | - https://maximelabonne.substack.com/p/exllamav2-the-fastest-library-to-run-llms-32aeda294d26 64 | - https://maximelabonne.substack.com/p/quantize-llama-models-with-ggml-and-llama-cpp-3612dfbcc172 65 | - https://maximelabonne.substack.com/p/a-beginners-guide-to-llm-fine-tuning-4bae7d4da672 66 | - https://maximelabonne.substack.com/p/graph-convolutional-networks-introduction-to-gnns-24b3f60d6c95 67 | - https://maximelabonne.substack.com/p/4-bit-quantization-with-gptq-36b0f4f02c34 68 | - https://maximelabonne.substack.com/p/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32 69 | - https://maximelabonne.substack.com/p/introduction-to-weight-quantization-2494701b9c0c 70 | - https://maximelabonne.substack.com/p/decoding-strategies-in-large-language-models-9733a8f70539 71 | - 
https://maximelabonne.substack.com/p/the-art-of-spending-optimizing-your-marketing-budget-with-nonlinear-optimization-6c8a39afb3c2 72 | - https://maximelabonne.substack.com/p/create-a-bot-to-find-diamonds-in-minecraft-d836606a993a 73 | - https://maximelabonne.substack.com/p/constraint-programming-67ac16fa0c81 74 | - https://maximelabonne.substack.com/p/how-to-design-the-most-powerful-graph-neural-network-3d18b07a6e66 75 | - https://maximelabonne.substack.com/p/introduction-to-graphsage-in-python-a9e7f9ecf9d7 76 | - https://maximelabonne.substack.com/p/graph-attention-networks-in-python-975736ac5c0c 77 | - https://maximelabonne.substack.com/p/integer-programming-vs-linear-programming-in-python-f1be5bb4e60e 78 | - https://maximelabonne.substack.com/p/introduction-to-linear-programming-in-python-9261e7eb44b 79 | - https://maximelabonne.substack.com/p/what-is-a-tensor-in-deep-learning-6dedd95d6507 80 | - https://maximelabonne.substack.com/p/efficiently-iterating-over-rows-in-a-pandas-dataframe-7dd5f9992c01 81 | - https://maximelabonne.substack.com/p/q-learning-for-beginners-2837b777741 82 | - https://maximelabonne.substack.com/p/how-to-start-machine-learning-for-developers-in-2022-390af12b193f 83 | # Generate instruct dataset pipeline parameters 84 | test_split_size: 0.1 85 | push_to_huggingface: false 86 | dataset_id: pauliusztin/llmtwin 87 | mock: false -------------------------------------------------------------------------------- /configs/evaluating.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | is_dummy: true # Change this to 'false' to run the evaluation on the full dataset. 
10 | -------------------------------------------------------------------------------- /configs/export_artifact_to_json.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | artifact_names: 10 | - raw_documents 11 | - cleaned_documents 12 | - instruct_datasets 13 | - preference_datasets 14 | -------------------------------------------------------------------------------- /configs/feature_engineering.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | author_full_names: 10 | - Maxime Labonne 11 | - Paul Iusztin 12 | -------------------------------------------------------------------------------- /configs/generate_instruct_datasets.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | test_split_size: 0.1 10 | dataset_type: "instruction" 11 | push_to_huggingface: true 12 | dataset_id: pauliusztin/llmtwin 13 | mock: false 14 | -------------------------------------------------------------------------------- /configs/generate_preference_datasets.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | test_split_size: 0.05 10 | dataset_type: "preference" 11 | push_to_huggingface: true 12 | dataset_id: pauliusztin/llmtwin-dpo 13 | mock: false 14 | -------------------------------------------------------------------------------- /configs/training.yaml: -------------------------------------------------------------------------------- 1 | settings: 2 | docker: 3 | parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest 4 | skip_build: True 5 | orchestrator.sagemaker: 6 | synchronous: false 7 | 8 | parameters: 9 | finetuning_type: sft 10 | num_train_epochs: 3 11 | per_device_train_batch_size: 2 12 | learning_rate: 3e-4 13 | dataset_huggingface_workspace: mlabonne 14 | is_dummy: true # Change this to 'false' to run the training with the full dataset and epochs. 
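Editor's sketch: the hyperparameters in configs/training.yaml above (three epochs, per-device batch size of 2, learning rate of 3e-4) are forwarded to the SFT fine-tuning job, while `is_dummy: true` shrinks the dataset and epochs for a cheap smoke-test run, as the inline comment notes. The snippet below is illustrative only — not the repository's finetune.py, whose exact argument names are not shown in this listing — and assumes the transformers library; it shows how these values would typically map onto Hugging Face TrainingArguments.

from transformers import TrainingArguments

# Values copied from configs/training.yaml; output_dir is a hypothetical path.
args = TrainingArguments(
    output_dir="output/sft",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=3e-4,
)
print(args.num_train_epochs, args.per_device_train_batch_size, args.learning_rate)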
15 | -------------------------------------------------------------------------------- /data/data_warehouse_raw_data/PostDocument.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /data/data_warehouse_raw_data/RepositoryDocument.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /data/data_warehouse_raw_data/UserDocument.json: -------------------------------------------------------------------------------- 1 | [{"first_name": "Maxime", "last_name": "Labonne", "_id": "eff74089-0271-4319-8543-745c087f4f61"}, {"first_name": "Paul", "last_name": "Iusztin", "_id": "b5fa1f08-75f0-402d-8e88-d1357e346d9e"}] -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | mongo: 3 | image: mongo:latest 4 | container_name: "llm_engineering_mongo" 5 | logging: 6 | options: 7 | max-size: 1g 8 | environment: 9 | MONGO_INITDB_ROOT_USERNAME: "llm_engineering" 10 | MONGO_INITDB_ROOT_PASSWORD: "llm_engineering" 11 | ports: 12 | - 27017:27017 13 | volumes: 14 | - mongo_data:/data/db 15 | networks: 16 | - local 17 | restart: always 18 | 19 | qdrant: 20 | image: qdrant/qdrant:latest 21 | container_name: "llm_engineering_qdrant" 22 | ports: 23 | - 6333:6333 24 | - 6334:6334 25 | expose: 26 | - 6333 27 | - 6334 28 | volumes: 29 | - qdrant_data:/qdrant/storage 30 | networks: 31 | - local 32 | restart: always 33 | 34 | volumes: 35 | mongo_data: 36 | qdrant_data: 37 | 38 | networks: 39 | local: 40 | driver: bridge -------------------------------------------------------------------------------- /images/cover_plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/images/cover_plus.png -------------------------------------------------------------------------------- /images/crazy_cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/images/crazy_cat.jpg -------------------------------------------------------------------------------- /llm_engineering/__init__.py: -------------------------------------------------------------------------------- 1 | from llm_engineering import application, domain, infrastructure 2 | from llm_engineering.settings import settings 3 | 4 | __all__ = ["settings", "application", "domain", "infrastructure"] 5 | -------------------------------------------------------------------------------- /llm_engineering/application/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import utils 2 | 3 | __all__ = ["utils"] 4 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/__init__.py: -------------------------------------------------------------------------------- 1 | from .dispatcher import CrawlerDispatcher 2 | from .github import GithubCrawler 3 | from .linkedin import LinkedInCrawler 4 | from .medium import MediumCrawler 5 | 6 | __all__ = ["CrawlerDispatcher", "GithubCrawler", "LinkedInCrawler", "MediumCrawler"] 7 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/base.py: -------------------------------------------------------------------------------- 1 | import time 2 | from abc import ABC, abstractmethod 3 | from tempfile import mkdtemp 4 | 5 | import chromedriver_autoinstaller 6 | from selenium import webdriver 7 | from selenium.webdriver.chrome.options import Options 8 | 9 | from llm_engineering.domain.documents import NoSQLBaseDocument 10 | 11 | # Check if the current version of chromedriver exists 12 | # and if it doesn't exist, download it automatically, 13 | # then add chromedriver to path 14 | chromedriver_autoinstaller.install() 15 | 16 | 17 | class BaseCrawler(ABC): 18 | model: type[NoSQLBaseDocument] 19 | 20 | @abstractmethod 21 | def extract(self, link: str, **kwargs) -> None: ... 22 | 23 | 24 | class BaseSeleniumCrawler(BaseCrawler, ABC): 25 | def __init__(self, scroll_limit: int = 5) -> None: 26 | options = webdriver.ChromeOptions() 27 | 28 | options.add_argument("--no-sandbox") 29 | options.add_argument("--headless=new") 30 | options.add_argument("--disable-dev-shm-usage") 31 | options.add_argument("--log-level=3") 32 | options.add_argument("--disable-popup-blocking") 33 | options.add_argument("--disable-notifications") 34 | options.add_argument("--disable-extensions") 35 | options.add_argument("--disable-background-networking") 36 | options.add_argument("--ignore-certificate-errors") 37 | options.add_argument(f"--user-data-dir={mkdtemp()}") 38 | options.add_argument(f"--data-path={mkdtemp()}") 39 | options.add_argument(f"--disk-cache-dir={mkdtemp()}") 40 | options.add_argument("--remote-debugging-port=9226") 41 | 42 | self.set_extra_driver_options(options) 43 | 44 | self.scroll_limit = scroll_limit 45 | self.driver = webdriver.Chrome( 46 | options=options, 47 | ) 48 | 49 | def set_extra_driver_options(self, options: Options) -> None: 50 | pass 51 | 52 | def login(self) -> None: 53 | pass 54 | 55 | def scroll_page(self) -> None: 56 | """Scroll through the LinkedIn page based on the scroll limit.""" 57 | current_scroll = 0 58 | last_height = self.driver.execute_script("return document.body.scrollHeight") 59 | while True: 60 | self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 61 | time.sleep(5) 62 | new_height = self.driver.execute_script("return document.body.scrollHeight") 63 | if new_height == last_height or (self.scroll_limit and current_scroll >= self.scroll_limit): 64 | break 65 | last_height = new_height 66 | current_scroll += 1 67 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/custom_article.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from langchain_community.document_loaders import AsyncHtmlLoader 4 | from langchain_community.document_transformers.html2text import Html2TextTransformer 5 | from loguru import logger 
6 | 7 | from llm_engineering.domain.documents import ArticleDocument 8 | 9 | from .base import BaseCrawler 10 | 11 | 12 | class CustomArticleCrawler(BaseCrawler): 13 | model = ArticleDocument 14 | 15 | def __init__(self) -> None: 16 | super().__init__() 17 | 18 | def extract(self, link: str, **kwargs) -> None: 19 | old_model = self.model.find(link=link) 20 | if old_model is not None: 21 | logger.info(f"Article already exists in the database: {link}") 22 | 23 | return 24 | 25 | logger.info(f"Starting scraping article: {link}") 26 | 27 | loader = AsyncHtmlLoader([link]) 28 | docs = loader.load() 29 | 30 | html2text = Html2TextTransformer() 31 | docs_transformed = html2text.transform_documents(docs) 32 | doc_transformed = docs_transformed[0] 33 | 34 | content = { 35 | "Title": doc_transformed.metadata.get("title"), 36 | "Subtitle": doc_transformed.metadata.get("description"), 37 | "Content": doc_transformed.page_content, 38 | "language": doc_transformed.metadata.get("language"), 39 | } 40 | 41 | parsed_url = urlparse(link) 42 | platform = parsed_url.netloc 43 | 44 | user = kwargs["user"] 45 | instance = self.model( 46 | content=content, 47 | link=link, 48 | platform=platform, 49 | author_id=user.id, 50 | author_full_name=user.full_name, 51 | ) 52 | instance.save() 53 | 54 | logger.info(f"Finished scraping custom article: {link}") 55 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/dispatcher.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import urlparse 3 | 4 | from loguru import logger 5 | 6 | from .base import BaseCrawler 7 | from .custom_article import CustomArticleCrawler 8 | from .github import GithubCrawler 9 | from .linkedin import LinkedInCrawler 10 | from .medium import MediumCrawler 11 | 12 | 13 | class CrawlerDispatcher: 14 | def __init__(self) -> None: 15 | self._crawlers = {} 16 | 17 | @classmethod 18 | def build(cls) -> "CrawlerDispatcher": 19 | dispatcher = cls() 20 | 21 | return dispatcher 22 | 23 | def register_medium(self) -> "CrawlerDispatcher": 24 | self.register("https://medium.com", MediumCrawler) 25 | 26 | return self 27 | 28 | def register_linkedin(self) -> "CrawlerDispatcher": 29 | self.register("https://linkedin.com", LinkedInCrawler) 30 | 31 | return self 32 | 33 | def register_github(self) -> "CrawlerDispatcher": 34 | self.register("https://github.com", GithubCrawler) 35 | 36 | return self 37 | 38 | def register(self, domain: str, crawler: type[BaseCrawler]) -> None: 39 | parsed_domain = urlparse(domain) 40 | domain = parsed_domain.netloc 41 | 42 | self._crawlers[r"https://(www\.)?{}/*".format(re.escape(domain))] = crawler 43 | 44 | def get_crawler(self, url: str) -> BaseCrawler: 45 | for pattern, crawler in self._crawlers.items(): 46 | if re.match(pattern, url): 47 | return crawler() 48 | else: 49 | logger.warning(f"No crawler found for {url}. 
Defaulting to CustomArticleCrawler.") 50 | 51 | return CustomArticleCrawler() 52 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/github.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | import tempfile 5 | 6 | from loguru import logger 7 | 8 | from llm_engineering.domain.documents import RepositoryDocument 9 | 10 | from .base import BaseCrawler 11 | 12 | 13 | class GithubCrawler(BaseCrawler): 14 | model = RepositoryDocument 15 | 16 | def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None: 17 | super().__init__() 18 | self._ignore = ignore 19 | 20 | def extract(self, link: str, **kwargs) -> None: 21 | old_model = self.model.find(link=link) 22 | if old_model is not None: 23 | logger.info(f"Repository already exists in the database: {link}") 24 | 25 | return 26 | 27 | logger.info(f"Starting scraping GitHub repository: {link}") 28 | 29 | repo_name = link.rstrip("/").split("/")[-1] 30 | 31 | local_temp = tempfile.mkdtemp() 32 | 33 | try: 34 | os.chdir(local_temp) 35 | subprocess.run(["git", "clone", link]) 36 | 37 | repo_path = os.path.join(local_temp, os.listdir(local_temp)[0]) # noqa: PTH118 38 | 39 | tree = {} 40 | for root, _, files in os.walk(repo_path): 41 | dir = root.replace(repo_path, "").lstrip("/") 42 | if dir.startswith(self._ignore): 43 | continue 44 | 45 | for file in files: 46 | if file.endswith(self._ignore): 47 | continue 48 | file_path = os.path.join(dir, file) # noqa: PTH118 49 | with open(os.path.join(root, file), "r", errors="ignore") as f: # noqa: PTH123, PTH118 50 | tree[file_path] = f.read().replace(" ", "") 51 | 52 | user = kwargs["user"] 53 | instance = self.model( 54 | content=tree, 55 | name=repo_name, 56 | link=link, 57 | platform="github", 58 | author_id=user.id, 59 | author_full_name=user.full_name, 60 | ) 61 | instance.save() 62 | 63 | except Exception: 64 | raise 65 | finally: 66 | shutil.rmtree(local_temp) 67 | 68 | logger.info(f"Finished scraping GitHub repository: {link}") 69 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/linkedin.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Dict, List 3 | 4 | from bs4 import BeautifulSoup 5 | from bs4.element import Tag 6 | from loguru import logger 7 | from selenium.webdriver.common.by import By 8 | 9 | from llm_engineering.domain.documents import PostDocument 10 | from llm_engineering.domain.exceptions import ImproperlyConfigured 11 | from llm_engineering.settings import settings 12 | 13 | from .base import BaseSeleniumCrawler 14 | 15 | 16 | class LinkedInCrawler(BaseSeleniumCrawler): 17 | model = PostDocument 18 | 19 | def __init__(self, scroll_limit: int = 5, is_deprecated: bool = True) -> None: 20 | super().__init__(scroll_limit) 21 | 22 | self._is_deprecated = is_deprecated 23 | 24 | def set_extra_driver_options(self, options) -> None: 25 | options.add_experimental_option("detach", True) 26 | 27 | def login(self) -> None: 28 | if self._is_deprecated: 29 | raise DeprecationWarning( 30 | "As LinkedIn has updated its security measures, the login() method is no longer supported." 
31 | ) 32 | 33 | self.driver.get("https://www.linkedin.com/login") 34 | if not settings.LINKEDIN_USERNAME or not settings.LINKEDIN_PASSWORD: 35 | raise ImproperlyConfigured( 36 | "LinkedIn scraper requires the {LINKEDIN_USERNAME} and {LINKEDIN_PASSWORD} settings." 37 | ) 38 | 39 | self.driver.find_element(By.ID, "username").send_keys(settings.LINKEDIN_USERNAME) 40 | self.driver.find_element(By.ID, "password").send_keys(settings.LINKEDIN_PASSWORD) 41 | self.driver.find_element(By.CSS_SELECTOR, ".login__form_action_container button").click() 42 | 43 | def extract(self, link: str, **kwargs) -> None: 44 | if self._is_deprecated: 45 | raise DeprecationWarning( 46 | "As LinkedIn has updated its feed structure, the extract() method is no longer supported." 47 | ) 48 | 49 | if self.model.link is not None: 50 | old_model = self.model.find(link=link) 51 | if old_model is not None: 52 | logger.info(f"Post already exists in the database: {link}") 53 | 54 | return 55 | 56 | logger.info(f"Starting scraping data for profile: {link}") 57 | 58 | self.login() 59 | 60 | soup = self._get_page_content(link) 61 | 62 | data = { # noqa 63 | "Name": self._scrape_section(soup, "h1", class_="text-heading-xlarge"), 64 | "About": self._scrape_section(soup, "div", class_="display-flex ph5 pv3"), 65 | "Main Page": self._scrape_section(soup, "div", {"id": "main-content"}), 66 | "Experience": self._scrape_experience(link), 67 | "Education": self._scrape_education(link), 68 | } 69 | 70 | self.driver.get(link) 71 | time.sleep(5) 72 | button = self.driver.find_element( 73 | By.CSS_SELECTOR, ".app-aware-link.profile-creator-shared-content-view__footer-action" 74 | ) 75 | button.click() 76 | 77 | # Scrolling and scraping posts 78 | self.scroll_page() 79 | soup = BeautifulSoup(self.driver.page_source, "html.parser") 80 | post_elements = soup.find_all( 81 | "div", 82 | class_="update-components-text relative update-components-update-v2__commentary", 83 | ) 84 | buttons = soup.find_all("button", class_="update-components-image__image-link") 85 | post_images = self._extract_image_urls(buttons) 86 | 87 | posts = self._extract_posts(post_elements, post_images) 88 | logger.info(f"Found {len(posts)} posts for profile: {link}") 89 | 90 | self.driver.close() 91 | 92 | user = kwargs["user"] 93 | self.model.bulk_insert( 94 | [ 95 | PostDocument(platform="linkedin", content=post, author_id=user.id, author_full_name=user.full_name) 96 | for post in posts 97 | ] 98 | ) 99 | 100 | logger.info(f"Finished scraping data for profile: {link}") 101 | 102 | def _scrape_section(self, soup: BeautifulSoup, *args, **kwargs) -> str: 103 | """Scrape a specific section of the LinkedIn profile.""" 104 | # Example: Scrape the 'About' section 105 | 106 | parent_div = soup.find(*args, **kwargs) 107 | 108 | return parent_div.get_text(strip=True) if parent_div else "" 109 | 110 | def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]: 111 | """ 112 | Extracts image URLs from button elements. 113 | 114 | Args: 115 | buttons (List[Tag]): A list of BeautifulSoup Tag objects representing buttons. 116 | 117 | Returns: 118 | Dict[str, str]: A dictionary mapping post indexes to image URLs. 
119 | """ 120 | 121 | post_images = {} 122 | for i, button in enumerate(buttons): 123 | img_tag = button.find("img") 124 | if img_tag and "src" in img_tag.attrs: 125 | post_images[f"Post_{i}"] = img_tag["src"] 126 | else: 127 | logger.warning("No image found in this button") 128 | return post_images 129 | 130 | def _get_page_content(self, url: str) -> BeautifulSoup: 131 | """Retrieve the page content of a given URL.""" 132 | 133 | self.driver.get(url) 134 | time.sleep(5) 135 | 136 | return BeautifulSoup(self.driver.page_source, "html.parser") 137 | 138 | def _extract_posts(self, post_elements: List[Tag], post_images: Dict[str, str]) -> Dict[str, Dict[str, str]]: 139 | """ 140 | Extracts post texts and combines them with their respective images. 141 | 142 | Args: 143 | post_elements (List[Tag]): A list of BeautifulSoup Tag objects representing post elements. 144 | post_images (Dict[str, str]): A dictionary containing image URLs mapped by post index. 145 | 146 | Returns: 147 | Dict[str, Dict[str, str]]: A dictionary containing post data with text and optional image URL. 148 | """ 149 | 150 | posts_data = {} 151 | for i, post_element in enumerate(post_elements): 152 | post_text = post_element.get_text(strip=True, separator="\n") 153 | post_data = {"text": post_text} 154 | if f"Post_{i}" in post_images: 155 | post_data["image"] = post_images[f"Post_{i}"] 156 | posts_data[f"Post_{i}"] = post_data 157 | 158 | return posts_data 159 | 160 | def _scrape_experience(self, profile_url: str) -> str: 161 | """Scrapes the Experience section of the LinkedIn profile.""" 162 | 163 | self.driver.get(profile_url + "/details/experience/") 164 | time.sleep(5) 165 | soup = BeautifulSoup(self.driver.page_source, "html.parser") 166 | experience_content = soup.find("section", {"id": "experience-section"}) 167 | 168 | return experience_content.get_text(strip=True) if experience_content else "" 169 | 170 | def _scrape_education(self, profile_url: str) -> str: 171 | self.driver.get(profile_url + "/details/education/") 172 | time.sleep(5) 173 | soup = BeautifulSoup(self.driver.page_source, "html.parser") 174 | education_content = soup.find("section", {"id": "education-section"}) 175 | 176 | return education_content.get_text(strip=True) if education_content else "" 177 | -------------------------------------------------------------------------------- /llm_engineering/application/crawlers/medium.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from loguru import logger 3 | 4 | from llm_engineering.domain.documents import ArticleDocument 5 | 6 | from .base import BaseSeleniumCrawler 7 | 8 | 9 | class MediumCrawler(BaseSeleniumCrawler): 10 | model = ArticleDocument 11 | 12 | def set_extra_driver_options(self, options) -> None: 13 | options.add_argument(r"--profile-directory=Profile 2") 14 | 15 | def extract(self, link: str, **kwargs) -> None: 16 | old_model = self.model.find(link=link) 17 | if old_model is not None: 18 | logger.info(f"Article already exists in the database: {link}") 19 | 20 | return 21 | 22 | logger.info(f"Starting scraping Medium article: {link}") 23 | 24 | self.driver.get(link) 25 | self.scroll_page() 26 | 27 | soup = BeautifulSoup(self.driver.page_source, "html.parser") 28 | title = soup.find_all("h1", class_="pw-post-title") 29 | subtitle = soup.find_all("h2", class_="pw-subtitle-paragraph") 30 | 31 | data = { 32 | "Title": title[0].string if title else None, 33 | "Subtitle": subtitle[0].string if subtitle else None, 34 |
"Content": soup.get_text(), 35 | } 36 | 37 | self.driver.close() 38 | 39 | user = kwargs["user"] 40 | instance = self.model( 41 | platform="medium", 42 | content=data, 43 | link=link, 44 | author_id=user.id, 45 | author_full_name=user.full_name, 46 | ) 47 | instance.save() 48 | 49 | logger.info(f"Successfully scraped and saved article: {link}") 50 | -------------------------------------------------------------------------------- /llm_engineering/application/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from . import generation 2 | 3 | __all__ = ["generation"] 4 | -------------------------------------------------------------------------------- /llm_engineering/application/dataset/constants.py: -------------------------------------------------------------------------------- 1 | from llm_engineering.domain.dataset import DatasetType 2 | 3 | MOCKED_RESPONSE_INSTRUCT = """ 4 | [ 5 | {"instruction": " 1", "answer": " 1"}, 6 | {"instruction": " 2", "answer": " 2"}, 7 | {"instruction": " 3", "answer": " 3"} 8 | ] 9 | """ 10 | 11 | MOCKED_RESPONSE_PREFERENCE = """ 12 | [ 13 | {"instruction": " 1", "rejected": " 1", "chosen": "Mocked extracted extracted extracted extracted extracted extracted extracted extracted extracted extracted answer 1."}, 14 | {"instruction": " 2", "rejected": " 2", "chosen": "Mocked extracted extracted extracted extracted extracted extracted extracted extracted extracted extracted answer 2."}, 15 | {"instruction": " 3", "rejected": " 3", "chosen": "Mocked extracted answer 3"} 16 | ] 17 | """ 18 | 19 | 20 | def get_mocked_response(dataset_type: DatasetType) -> str: 21 | if dataset_type == DatasetType.INSTRUCTION: 22 | return MOCKED_RESPONSE_INSTRUCT 23 | elif dataset_type == DatasetType.PREFERENCE: 24 | return MOCKED_RESPONSE_PREFERENCE 25 | else: 26 | raise ValueError(f"Invalid dataset type: {dataset_type}") 27 | -------------------------------------------------------------------------------- /llm_engineering/application/dataset/output_parsers.py: -------------------------------------------------------------------------------- 1 | from langchain.output_parsers import PydanticOutputParser 2 | 3 | 4 | class ListPydanticOutputParser(PydanticOutputParser): 5 | def _parse_obj(self, obj: dict | list): 6 | if isinstance(obj, list): 7 | return [super(ListPydanticOutputParser, self)._parse_obj(obj_) for obj_ in obj] 8 | else: 9 | return super(ListPydanticOutputParser, self)._parse_obj(obj) 10 | -------------------------------------------------------------------------------- /llm_engineering/application/dataset/utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | 3 | from llm_engineering.application.preprocessing.operations.chunking import chunk_document 4 | from llm_engineering.domain.cleaned_documents import CleanedDocument 5 | from llm_engineering.domain.dataset import ( 6 | InstructDataset, 7 | InstructDatasetSample, 8 | InstructTrainTestSplit, 9 | PreferenceDataset, 10 | PreferenceDatasetSample, 11 | PreferenceTrainTestSplit, 12 | ) 13 | from llm_engineering.domain.types import DataCategory 14 | 15 | 16 | def create_instruct_train_test_split( 17 | data: dict[DataCategory, InstructDataset], test_size=0.2, random_state=42 18 | ) -> InstructTrainTestSplit: 19 | train_data = {} 20 | test_data = {} 21 | 22 | for category, dataset in data.items(): 23 | samples = dataset.samples 24 | samples_dicts = [sample.model_dump() for sample 
in samples] 25 | 26 | if len(samples_dicts) > 0: 27 | train_samples_dicts, test_samples_dicts = train_test_split( 28 | samples_dicts, test_size=test_size, random_state=random_state 29 | ) 30 | train_samples = [InstructDatasetSample(**sample_dict) for sample_dict in train_samples_dicts] 31 | test_samples = [InstructDatasetSample(**sample_dict) for sample_dict in test_samples_dicts] 32 | else: 33 | train_samples = [] 34 | test_samples = [] 35 | 36 | train_dataset = InstructDataset(category=category, samples=train_samples) 37 | test_dataset = InstructDataset(category=category, samples=test_samples) 38 | 39 | train_data[category] = train_dataset 40 | test_data[category] = test_dataset 41 | 42 | return InstructTrainTestSplit(train=train_data, test=test_data, test_split_size=test_size) 43 | 44 | 45 | def create_preference_train_test_split( 46 | data: dict[DataCategory, PreferenceDataset], test_size=0.2, random_state=42 47 | ) -> PreferenceTrainTestSplit: 48 | train_data = {} 49 | test_data = {} 50 | 51 | for category, dataset in data.items(): 52 | samples = dataset.samples 53 | samples_dicts = [sample.model_dump() for sample in samples] 54 | 55 | if len(samples_dicts) > 0: 56 | train_samples_dicts, test_samples_dicts = train_test_split( 57 | samples_dicts, test_size=test_size, random_state=random_state 58 | ) 59 | train_samples = [PreferenceDatasetSample(**sample_dict) for sample_dict in train_samples_dicts] 60 | test_samples = [PreferenceDatasetSample(**sample_dict) for sample_dict in test_samples_dicts] 61 | else: 62 | train_samples = [] 63 | test_samples = [] 64 | 65 | train_dataset = PreferenceDataset(category=category, samples=train_samples) 66 | test_dataset = PreferenceDataset(category=category, samples=test_samples) 67 | 68 | train_data[category] = train_dataset 69 | test_data[category] = test_dataset 70 | 71 | return PreferenceTrainTestSplit(train=train_data, test=test_data, test_split_size=test_size) 72 | 73 | 74 | def filter_short_answers( 75 | data: dict[DataCategory, PreferenceDataset], min_length: int = 100 76 | ) -> dict[DataCategory, PreferenceDataset]: 77 | def is_long_enough(example: PreferenceDatasetSample) -> bool: 78 | return len(example.chosen) >= min_length 79 | 80 | filtered_data = {} 81 | for category, dataset in data.items(): 82 | filtered_dataset_samples = list(filter(is_long_enough, dataset.samples)) 83 | filtered_dataset = PreferenceDataset(category=category, samples=filtered_dataset_samples) 84 | 85 | filtered_data[category] = filtered_dataset 86 | 87 | return filtered_data 88 | 89 | 90 | def filter_answer_format(data: dict[DataCategory, PreferenceDataset]) -> dict[DataCategory, PreferenceDataset]: 91 | def is_valid_format(example: PreferenceDatasetSample) -> bool: 92 | chosen = example.chosen 93 | 94 | return len(chosen) > 0 and chosen[0].isupper() and chosen[-1] in (".", "!", "?") 95 | 96 | filtered_data = {} 97 | for category, dataset in data.items(): 98 | filtered_dataset_samples = list(filter(is_valid_format, dataset.samples)) 99 | filtered_dataset = PreferenceDataset(category=category, samples=filtered_dataset_samples) 100 | 101 | filtered_data[category] = filtered_dataset 102 | 103 | return filtered_data 104 | 105 | 106 | def extract_substrings( 107 | documents: list[CleanedDocument], min_length: int = 1000, max_length: int = 2000 108 | ) -> list[CleanedDocument]: 109 | extracts = [] 110 | for document in documents: 111 | document_extracts = chunk_document(document.content, min_length, max_length) 112 | for extract in document_extracts: 113 | subdocument
= document.model_copy() 114 | subdocument.content = extract 115 | 116 | extracts.append(subdocument) 117 | 118 | return extracts 119 | -------------------------------------------------------------------------------- /llm_engineering/application/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .embeddings import CrossEncoderModelSingleton, EmbeddingModelSingleton 2 | 3 | __all__ = ["EmbeddingModelSingleton", "CrossEncoderModelSingleton"] 4 | -------------------------------------------------------------------------------- /llm_engineering/application/networks/base.py: -------------------------------------------------------------------------------- 1 | from threading import Lock 2 | from typing import ClassVar 3 | 4 | 5 | class SingletonMeta(type): 6 | """ 7 | This is a thread-safe implementation of Singleton. 8 | """ 9 | 10 | _instances: ClassVar = {} 11 | 12 | _lock: Lock = Lock() 13 | 14 | """ 15 | We now have a lock object that will be used to synchronize threads during 16 | first access to the Singleton. 17 | """ 18 | 19 | def __call__(cls, *args, **kwargs): 20 | """ 21 | Possible changes to the value of the `__init__` argument do not affect 22 | the returned instance. 23 | """ 24 | # Now, imagine that the program has just been launched. Since there's no 25 | # Singleton instance yet, multiple threads can simultaneously pass the 26 | # previous conditional and reach this point almost at the same time. The 27 | # first of them will acquire lock and will proceed further, while the 28 | # rest will wait here. 29 | with cls._lock: 30 | # The first thread to acquire the lock, reaches this conditional, 31 | # goes inside and creates the Singleton instance. Once it leaves the 32 | # lock block, a thread that might have been waiting for the lock 33 | # release may then enter this section. But since the Singleton field 34 | # is already initialized, the thread won't create a new object. 35 | if cls not in cls._instances: 36 | instance = super().__call__(*args, **kwargs) 37 | cls._instances[cls] = instance 38 | 39 | return cls._instances[cls] 40 | -------------------------------------------------------------------------------- /llm_engineering/application/networks/embeddings.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import numpy as np 6 | from loguru import logger 7 | from numpy.typing import NDArray 8 | from sentence_transformers.SentenceTransformer import SentenceTransformer 9 | from sentence_transformers.cross_encoder import CrossEncoder 10 | from transformers import AutoTokenizer 11 | 12 | from llm_engineering.settings import settings 13 | 14 | from .base import SingletonMeta 15 | 16 | 17 | class EmbeddingModelSingleton(metaclass=SingletonMeta): 18 | """ 19 | A singleton class that provides a pre-trained transformer model for generating embeddings of input text. 
20 | """ 21 | 22 | def __init__( 23 | self, 24 | model_id: str = settings.TEXT_EMBEDDING_MODEL_ID, 25 | device: str = settings.RAG_MODEL_DEVICE, 26 | cache_dir: Optional[Path] = None, 27 | ) -> None: 28 | self._model_id = model_id 29 | self._device = device 30 | 31 | self._model = SentenceTransformer( 32 | self._model_id, 33 | device=self._device, 34 | cache_folder=str(cache_dir) if cache_dir else None, 35 | ) 36 | self._model.eval() 37 | 38 | @property 39 | def model_id(self) -> str: 40 | """ 41 | Returns the identifier of the pre-trained transformer model to use. 42 | 43 | Returns: 44 | str: The identifier of the pre-trained transformer model to use. 45 | """ 46 | 47 | return self._model_id 48 | 49 | @cached_property 50 | def embedding_size(self) -> int: 51 | """ 52 | Returns the size of the embeddings generated by the pre-trained transformer model. 53 | 54 | Returns: 55 | int: The size of the embeddings generated by the pre-trained transformer model. 56 | """ 57 | 58 | dummy_embedding = self._model.encode("") 59 | 60 | return dummy_embedding.shape[0] 61 | 62 | @property 63 | def max_input_length(self) -> int: 64 | """ 65 | Returns the maximum length of input text to tokenize. 66 | 67 | Returns: 68 | int: The maximum length of input text to tokenize. 69 | """ 70 | 71 | return self._model.max_seq_length 72 | 73 | @property 74 | def tokenizer(self) -> AutoTokenizer: 75 | """ 76 | Returns the tokenizer used to tokenize input text. 77 | 78 | Returns: 79 | AutoTokenizer: The tokenizer used to tokenize input text. 80 | """ 81 | 82 | return self._model.tokenizer 83 | 84 | def __call__( 85 | self, input_text: str | list[str], to_list: bool = True 86 | ) -> NDArray[np.float32] | list[float] | list[list[float]]: 87 | """ 88 | Generates embeddings for the input text using the pre-trained transformer model. 89 | 90 | Args: 91 | input_text (str): The input text to generate embeddings for. 92 | to_list (bool): Whether to return the embeddings as a list or numpy array. Defaults to True. 93 | 94 | Returns: 95 | Union[np.ndarray, list]: The embeddings generated for the input text. 96 | """ 97 | 98 | try: 99 | embeddings = self._model.encode(input_text) 100 | except Exception: 101 | logger.error(f"Error generating embeddings for {self._model_id=} and {input_text=}") 102 | 103 | return [] if to_list else np.array([]) 104 | 105 | if to_list: 106 | embeddings = embeddings.tolist() 107 | 108 | return embeddings 109 | 110 | 111 | class CrossEncoderModelSingleton(metaclass=SingletonMeta): 112 | def __init__( 113 | self, 114 | model_id: str = settings.RERANKING_CROSS_ENCODER_MODEL_ID, 115 | device: str = settings.RAG_MODEL_DEVICE, 116 | ) -> None: 117 | """ 118 | A singleton class that provides a pre-trained cross-encoder model for scoring pairs of input text. 
119 | """ 120 | 121 | self._model_id = model_id 122 | self._device = device 123 | 124 | self._model = CrossEncoder( 125 | model_name=self._model_id, 126 | device=self._device, 127 | ) 128 | self._model.model.eval() 129 | 130 | def __call__(self, pairs: list[tuple[str, str]], to_list: bool = True) -> NDArray[np.float32] | list[float]: 131 | scores = self._model.predict(pairs) 132 | 133 | if to_list: 134 | scores = scores.tolist() 135 | 136 | return scores 137 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .dispatchers import ChunkingDispatcher, CleaningDispatcher, EmbeddingDispatcher 2 | 3 | __all__ = ["CleaningDispatcher", "ChunkingDispatcher", "EmbeddingDispatcher"] 4 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/chunking_data_handlers.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from abc import ABC, abstractmethod 3 | from typing import Generic, TypeVar 4 | from uuid import UUID 5 | 6 | from llm_engineering.domain.chunks import ArticleChunk, Chunk, PostChunk, RepositoryChunk 7 | from llm_engineering.domain.cleaned_documents import ( 8 | CleanedArticleDocument, 9 | CleanedDocument, 10 | CleanedPostDocument, 11 | CleanedRepositoryDocument, 12 | ) 13 | 14 | from .operations import chunk_article, chunk_text 15 | 16 | CleanedDocumentT = TypeVar("CleanedDocumentT", bound=CleanedDocument) 17 | ChunkT = TypeVar("ChunkT", bound=Chunk) 18 | 19 | 20 | class ChunkingDataHandler(ABC, Generic[CleanedDocumentT, ChunkT]): 21 | """ 22 | Abstract class for all Chunking data handlers. 
23 | All data transformations logic for the chunking step is done here 24 | """ 25 | 26 | @property 27 | def metadata(self) -> dict: 28 | return { 29 | "chunk_size": 500, 30 | "chunk_overlap": 50, 31 | } 32 | 33 | @abstractmethod 34 | def chunk(self, data_model: CleanedDocumentT) -> list[ChunkT]: 35 | pass 36 | 37 | 38 | class PostChunkingHandler(ChunkingDataHandler): 39 | @property 40 | def metadata(self) -> dict: 41 | return { 42 | "chunk_size": 250, 43 | "chunk_overlap": 25, 44 | } 45 | 46 | def chunk(self, data_model: CleanedPostDocument) -> list[PostChunk]: 47 | data_models_list = [] 48 | 49 | cleaned_content = data_model.content 50 | chunks = chunk_text( 51 | cleaned_content, chunk_size=self.metadata["chunk_size"], chunk_overlap=self.metadata["chunk_overlap"] 52 | ) 53 | 54 | for chunk in chunks: 55 | chunk_id = hashlib.md5(chunk.encode()).hexdigest() 56 | model = PostChunk( 57 | id=UUID(chunk_id, version=4), 58 | content=chunk, 59 | platform=data_model.platform, 60 | document_id=data_model.id, 61 | author_id=data_model.author_id, 62 | author_full_name=data_model.author_full_name, 63 | image=data_model.image if data_model.image else None, 64 | metadata=self.metadata, 65 | ) 66 | data_models_list.append(model) 67 | 68 | return data_models_list 69 | 70 | 71 | class ArticleChunkingHandler(ChunkingDataHandler): 72 | @property 73 | def metadata(self) -> dict: 74 | return { 75 | "min_length": 1000, 76 | "max_length": 2000, 77 | } 78 | 79 | def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]: 80 | data_models_list = [] 81 | 82 | cleaned_content = data_model.content 83 | chunks = chunk_article( 84 | cleaned_content, min_length=self.metadata["min_length"], max_length=self.metadata["max_length"] 85 | ) 86 | 87 | for chunk in chunks: 88 | chunk_id = hashlib.md5(chunk.encode()).hexdigest() 89 | model = ArticleChunk( 90 | id=UUID(chunk_id, version=4), 91 | content=chunk, 92 | platform=data_model.platform, 93 | link=data_model.link, 94 | document_id=data_model.id, 95 | author_id=data_model.author_id, 96 | author_full_name=data_model.author_full_name, 97 | metadata=self.metadata, 98 | ) 99 | data_models_list.append(model) 100 | 101 | return data_models_list 102 | 103 | 104 | class RepositoryChunkingHandler(ChunkingDataHandler): 105 | @property 106 | def metadata(self) -> dict: 107 | return { 108 | "chunk_size": 1500, 109 | "chunk_overlap": 100, 110 | } 111 | 112 | def chunk(self, data_model: CleanedRepositoryDocument) -> list[RepositoryChunk]: 113 | data_models_list = [] 114 | 115 | cleaned_content = data_model.content 116 | chunks = chunk_text( 117 | cleaned_content, chunk_size=self.metadata["chunk_size"], chunk_overlap=self.metadata["chunk_overlap"] 118 | ) 119 | 120 | for chunk in chunks: 121 | chunk_id = hashlib.md5(chunk.encode()).hexdigest() 122 | model = RepositoryChunk( 123 | id=UUID(chunk_id, version=4), 124 | content=chunk, 125 | platform=data_model.platform, 126 | name=data_model.name, 127 | link=data_model.link, 128 | document_id=data_model.id, 129 | author_id=data_model.author_id, 130 | author_full_name=data_model.author_full_name, 131 | metadata=self.metadata, 132 | ) 133 | data_models_list.append(model) 134 | 135 | return data_models_list 136 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/cleaning_data_handlers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar 3 | 4 | 
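# The cleaning handlers below mirror the chunking handlers above: one handler
# per data category, each mapping a raw Document into its CleanedDocument
# counterpart by flattening the content dict into a single cleaned string.
# A minimal usage sketch (the `post_document` instance is hypothetical):
#
#   handler = PostCleaningHandler()
#   cleaned = handler.clean(post_document)   # -> CleanedPostDocument
#   assert isinstance(cleaned.content, str)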
from llm_engineering.domain.cleaned_documents import ( 5 | CleanedArticleDocument, 6 | CleanedDocument, 7 | CleanedPostDocument, 8 | CleanedRepositoryDocument, 9 | ) 10 | from llm_engineering.domain.documents import ( 11 | ArticleDocument, 12 | Document, 13 | PostDocument, 14 | RepositoryDocument, 15 | ) 16 | 17 | from .operations import clean_text 18 | 19 | DocumentT = TypeVar("DocumentT", bound=Document) 20 | CleanedDocumentT = TypeVar("CleanedDocumentT", bound=CleanedDocument) 21 | 22 | 23 | class CleaningDataHandler(ABC, Generic[DocumentT, CleanedDocumentT]): 24 | """ 25 | Abstract class for all cleaning data handlers. 26 | All data transformations logic for the cleaning step is done here 27 | """ 28 | 29 | @abstractmethod 30 | def clean(self, data_model: DocumentT) -> CleanedDocumentT: 31 | pass 32 | 33 | 34 | class PostCleaningHandler(CleaningDataHandler): 35 | def clean(self, data_model: PostDocument) -> CleanedPostDocument: 36 | return CleanedPostDocument( 37 | id=data_model.id, 38 | content=clean_text(" #### ".join(data_model.content.values())), 39 | platform=data_model.platform, 40 | author_id=data_model.author_id, 41 | author_full_name=data_model.author_full_name, 42 | image=data_model.image if data_model.image else None, 43 | ) 44 | 45 | 46 | class ArticleCleaningHandler(CleaningDataHandler): 47 | def clean(self, data_model: ArticleDocument) -> CleanedArticleDocument: 48 | valid_content = [content for content in data_model.content.values() if content] 49 | 50 | return CleanedArticleDocument( 51 | id=data_model.id, 52 | content=clean_text(" #### ".join(valid_content)), 53 | platform=data_model.platform, 54 | link=data_model.link, 55 | author_id=data_model.author_id, 56 | author_full_name=data_model.author_full_name, 57 | ) 58 | 59 | 60 | class RepositoryCleaningHandler(CleaningDataHandler): 61 | def clean(self, data_model: RepositoryDocument) -> CleanedRepositoryDocument: 62 | return CleanedRepositoryDocument( 63 | id=data_model.id, 64 | content=clean_text(" #### ".join(data_model.content.values())), 65 | platform=data_model.platform, 66 | name=data_model.name, 67 | link=data_model.link, 68 | author_id=data_model.author_id, 69 | author_full_name=data_model.author_full_name, 70 | ) 71 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/dispatchers.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | from llm_engineering.domain.base import NoSQLBaseDocument, VectorBaseDocument 4 | from llm_engineering.domain.types import DataCategory 5 | 6 | from .chunking_data_handlers import ( 7 | ArticleChunkingHandler, 8 | ChunkingDataHandler, 9 | PostChunkingHandler, 10 | RepositoryChunkingHandler, 11 | ) 12 | from .cleaning_data_handlers import ( 13 | ArticleCleaningHandler, 14 | CleaningDataHandler, 15 | PostCleaningHandler, 16 | RepositoryCleaningHandler, 17 | ) 18 | from .embedding_data_handlers import ( 19 | ArticleEmbeddingHandler, 20 | EmbeddingDataHandler, 21 | PostEmbeddingHandler, 22 | QueryEmbeddingHandler, 23 | RepositoryEmbeddingHandler, 24 | ) 25 | 26 | 27 | class CleaningHandlerFactory: 28 | @staticmethod 29 | def create_handler(data_category: DataCategory) -> CleaningDataHandler: 30 | if data_category == DataCategory.POSTS: 31 | return PostCleaningHandler() 32 | elif data_category == DataCategory.ARTICLES: 33 | return ArticleCleaningHandler() 34 | elif data_category == DataCategory.REPOSITORIES: 35 | return RepositoryCleaningHandler() 
36 | else: 37 | raise ValueError("Unsupported data type") 38 | 39 | 40 | class CleaningDispatcher: 41 | factory = CleaningHandlerFactory() 42 | 43 | @classmethod 44 | def dispatch(cls, data_model: NoSQLBaseDocument) -> VectorBaseDocument: 45 | data_category = DataCategory(data_model.get_collection_name()) 46 | handler = cls.factory.create_handler(data_category) 47 | clean_model = handler.clean(data_model) 48 | 49 | logger.info( 50 | "Document cleaned successfully.", 51 | data_category=data_category, 52 | cleaned_content_len=len(clean_model.content), 53 | ) 54 | 55 | return clean_model 56 | 57 | 58 | class ChunkingHandlerFactory: 59 | @staticmethod 60 | def create_handler(data_category: DataCategory) -> ChunkingDataHandler: 61 | if data_category == DataCategory.POSTS: 62 | return PostChunkingHandler() 63 | elif data_category == DataCategory.ARTICLES: 64 | return ArticleChunkingHandler() 65 | elif data_category == DataCategory.REPOSITORIES: 66 | return RepositoryChunkingHandler() 67 | else: 68 | raise ValueError("Unsupported data type") 69 | 70 | 71 | class ChunkingDispatcher: 72 | factory = ChunkingHandlerFactory 73 | 74 | @classmethod 75 | def dispatch(cls, data_model: VectorBaseDocument) -> list[VectorBaseDocument]: 76 | data_category = data_model.get_category() 77 | handler = cls.factory.create_handler(data_category) 78 | chunk_models = handler.chunk(data_model) 79 | 80 | logger.info( 81 | "Document chunked successfully.", 82 | num=len(chunk_models), 83 | data_category=data_category, 84 | ) 85 | 86 | return chunk_models 87 | 88 | 89 | class EmbeddingHandlerFactory: 90 | @staticmethod 91 | def create_handler(data_category: DataCategory) -> EmbeddingDataHandler: 92 | if data_category == DataCategory.QUERIES: 93 | return QueryEmbeddingHandler() 94 | if data_category == DataCategory.POSTS: 95 | return PostEmbeddingHandler() 96 | elif data_category == DataCategory.ARTICLES: 97 | return ArticleEmbeddingHandler() 98 | elif data_category == DataCategory.REPOSITORIES: 99 | return RepositoryEmbeddingHandler() 100 | else: 101 | raise ValueError("Unsupported data type") 102 | 103 | 104 | class EmbeddingDispatcher: 105 | factory = EmbeddingHandlerFactory 106 | 107 | @classmethod 108 | def dispatch( 109 | cls, data_model: VectorBaseDocument | list[VectorBaseDocument] 110 | ) -> VectorBaseDocument | list[VectorBaseDocument]: 111 | is_list = isinstance(data_model, list) 112 | if not is_list: 113 | data_model = [data_model] 114 | 115 | if len(data_model) == 0: 116 | return [] 117 | 118 | data_category = data_model[0].get_category() 119 | assert all( 120 | data_model.get_category() == data_category for data_model in data_model 121 | ), "Data models must be of the same category." 
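# At this point every chunk in the batch is known to share one category, so a
# single category-specific handler can embed the whole batch in one encode()
# pass instead of calling the embedding model once per chunk.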
122 | handler = cls.factory.create_handler(data_category) 123 | 124 | embedded_chunk_model = handler.embed_batch(data_model) 125 | 126 | if not is_list: 127 | embedded_chunk_model = embedded_chunk_model[0] 128 | 129 | logger.info( 130 | "Data embedded successfully.", 131 | data_category=data_category, 132 | ) 133 | 134 | return embedded_chunk_model 135 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/embedding_data_handlers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar, cast 3 | 4 | from llm_engineering.application.networks import EmbeddingModelSingleton 5 | from llm_engineering.domain.chunks import ArticleChunk, Chunk, PostChunk, RepositoryChunk 6 | from llm_engineering.domain.embedded_chunks import ( 7 | EmbeddedArticleChunk, 8 | EmbeddedChunk, 9 | EmbeddedPostChunk, 10 | EmbeddedRepositoryChunk, 11 | ) 12 | from llm_engineering.domain.queries import EmbeddedQuery, Query 13 | 14 | ChunkT = TypeVar("ChunkT", bound=Chunk) 15 | EmbeddedChunkT = TypeVar("EmbeddedChunkT", bound=EmbeddedChunk) 16 | 17 | embedding_model = EmbeddingModelSingleton() 18 | 19 | 20 | class EmbeddingDataHandler(ABC, Generic[ChunkT, EmbeddedChunkT]): 21 | """ 22 | Abstract class for all embedding data handlers. 23 | All data transformations logic for the embedding step is done here 24 | """ 25 | 26 | def embed(self, data_model: ChunkT) -> EmbeddedChunkT: 27 | return self.embed_batch([data_model])[0] 28 | 29 | def embed_batch(self, data_model: list[ChunkT]) -> list[EmbeddedChunkT]: 30 | embedding_model_input = [data_model.content for data_model in data_model] 31 | embeddings = embedding_model(embedding_model_input, to_list=True) 32 | 33 | embedded_chunk = [ 34 | self.map_model(data_model, cast(list[float], embedding)) 35 | for data_model, embedding in zip(data_model, embeddings, strict=False) 36 | ] 37 | 38 | return embedded_chunk 39 | 40 | @abstractmethod 41 | def map_model(self, data_model: ChunkT, embedding: list[float]) -> EmbeddedChunkT: 42 | pass 43 | 44 | 45 | class QueryEmbeddingHandler(EmbeddingDataHandler): 46 | def map_model(self, data_model: Query, embedding: list[float]) -> EmbeddedQuery: 47 | return EmbeddedQuery( 48 | id=data_model.id, 49 | author_id=data_model.author_id, 50 | author_full_name=data_model.author_full_name, 51 | content=data_model.content, 52 | embedding=embedding, 53 | metadata={ 54 | "embedding_model_id": embedding_model.model_id, 55 | "embedding_size": embedding_model.embedding_size, 56 | "max_input_length": embedding_model.max_input_length, 57 | }, 58 | ) 59 | 60 | 61 | class PostEmbeddingHandler(EmbeddingDataHandler): 62 | def map_model(self, data_model: PostChunk, embedding: list[float]) -> EmbeddedPostChunk: 63 | return EmbeddedPostChunk( 64 | id=data_model.id, 65 | content=data_model.content, 66 | embedding=embedding, 67 | platform=data_model.platform, 68 | document_id=data_model.document_id, 69 | author_id=data_model.author_id, 70 | author_full_name=data_model.author_full_name, 71 | metadata={ 72 | "embedding_model_id": embedding_model.model_id, 73 | "embedding_size": embedding_model.embedding_size, 74 | "max_input_length": embedding_model.max_input_length, 75 | }, 76 | ) 77 | 78 | 79 | class ArticleEmbeddingHandler(EmbeddingDataHandler): 80 | def map_model(self, data_model: ArticleChunk, embedding: list[float]) -> EmbeddedArticleChunk: 81 | return EmbeddedArticleChunk( 82 | 
id=data_model.id, 83 | content=data_model.content, 84 | embedding=embedding, 85 | platform=data_model.platform, 86 | link=data_model.link, 87 | document_id=data_model.document_id, 88 | author_id=data_model.author_id, 89 | author_full_name=data_model.author_full_name, 90 | metadata={ 91 | "embedding_model_id": embedding_model.model_id, 92 | "embedding_size": embedding_model.embedding_size, 93 | "max_input_length": embedding_model.max_input_length, 94 | }, 95 | ) 96 | 97 | 98 | class RepositoryEmbeddingHandler(EmbeddingDataHandler): 99 | def map_model(self, data_model: RepositoryChunk, embedding: list[float]) -> EmbeddedRepositoryChunk: 100 | return EmbeddedRepositoryChunk( 101 | id=data_model.id, 102 | content=data_model.content, 103 | embedding=embedding, 104 | platform=data_model.platform, 105 | name=data_model.name, 106 | link=data_model.link, 107 | document_id=data_model.document_id, 108 | author_id=data_model.author_id, 109 | author_full_name=data_model.author_full_name, 110 | metadata={ 111 | "embedding_model_id": embedding_model.model_id, 112 | "embedding_size": embedding_model.embedding_size, 113 | "max_input_length": embedding_model.max_input_length, 114 | }, 115 | ) 116 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/operations/__init__.py: -------------------------------------------------------------------------------- 1 | from .chunking import chunk_article, chunk_text 2 | from .cleaning import clean_text 3 | 4 | __all__ = [ 5 | "chunk_article", 6 | "chunk_text", 7 | "clean_text", 8 | ] 9 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/operations/chunking.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter 4 | 5 | from llm_engineering.application.networks import EmbeddingModelSingleton 6 | 7 | embedding_model = EmbeddingModelSingleton() 8 | 9 | 10 | def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list[str]: 11 | character_splitter = RecursiveCharacterTextSplitter(separators=["\n\n"], chunk_size=chunk_size, chunk_overlap=0) 12 | text_split_by_characters = character_splitter.split_text(text) 13 | 14 | token_splitter = SentenceTransformersTokenTextSplitter( 15 | chunk_overlap=chunk_overlap, 16 | tokens_per_chunk=embedding_model.max_input_length, 17 | model_name=embedding_model.model_id, 18 | ) 19 | chunks_by_tokens = [] 20 | for section in text_split_by_characters: 21 | chunks_by_tokens.extend(token_splitter.split_text(section)) 22 | 23 | return chunks_by_tokens 24 | 25 | 26 | def chunk_document(text: str, min_length: int, max_length: int) -> list[str]: 27 | """Alias for chunk_article().""" 28 | 29 | return chunk_article(text, min_length, max_length) 30 | 31 | 32 | def chunk_article(text: str, min_length: int, max_length: int) -> list[str]: 33 | sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text) 34 | 35 | extracts = [] 36 | current_chunk = "" 37 | for sentence in sentences: 38 | sentence = sentence.strip() 39 | if not sentence: 40 | continue 41 | 42 | if len(current_chunk) + len(sentence) <= max_length: 43 | current_chunk += sentence + " " 44 | else: 45 | if len(current_chunk) >= min_length: 46 | extracts.append(current_chunk.strip()) 47 | current_chunk = sentence + " " 48 | 49 | if len(current_chunk) >= min_length: 50 | extracts.append(current_chunk.strip()) 51 | 52 | return extracts 53 | -------------------------------------------------------------------------------- /llm_engineering/application/preprocessing/operations/cleaning.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def
clean_text(text: str) -> str: 5 | text = re.sub(r"[^\w\s.,!?]", " ", text) 6 | text = re.sub(r"\s+", " ", text) 7 | 8 | return text.strip() 9 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/application/rag/__init__.py -------------------------------------------------------------------------------- /llm_engineering/application/rag/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any 3 | 4 | from langchain.prompts import PromptTemplate 5 | from pydantic import BaseModel 6 | 7 | from llm_engineering.domain.queries import Query 8 | 9 | 10 | class PromptTemplateFactory(ABC, BaseModel): 11 | @abstractmethod 12 | def create_template(self) -> PromptTemplate: 13 | pass 14 | 15 | 16 | class RAGStep(ABC): 17 | def __init__(self, mock: bool = False) -> None: 18 | self._mock = mock 19 | 20 | @abstractmethod 21 | def generate(self, query: Query, *args, **kwargs) -> Any: 22 | pass 23 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/prompt_templates.py: -------------------------------------------------------------------------------- 1 | from langchain.prompts import PromptTemplate 2 | 3 | from .base import PromptTemplateFactory 4 | 5 | 6 | class QueryExpansionTemplate(PromptTemplateFactory): 7 | prompt: str = """You are an AI language model assistant. Your task is to generate {expand_to_n} 8 | different versions of the given user question to retrieve relevant documents from a vector 9 | database. By generating multiple perspectives on the user question, your goal is to help 10 | the user overcome some of the limitations of the distance-based similarity search. 11 | Provide these alternative questions separated by '{separator}'. 12 | Original question: {question}""" 13 | 14 | @property 15 | def separator(self) -> str: 16 | return "#next-question#" 17 | 18 | def create_template(self, expand_to_n: int) -> PromptTemplate: 19 | return PromptTemplate( 20 | template=self.prompt, 21 | input_variables=["question"], 22 | partial_variables={ 23 | "separator": self.separator, 24 | "expand_to_n": expand_to_n, 25 | }, 26 | ) 27 | 28 | 29 | class SelfQueryTemplate(PromptTemplateFactory): 30 | prompt: str = """You are an AI language model assistant. Your task is to extract information from a user question. 31 | The required information that needs to be extracted is the user name or user id. 32 | Your response should consist of only the extracted user name (e.g., John Doe) or id (e.g., 1345256), nothing else. 33 | If the user question does not contain any user name or id, you should return the following token: none. 34 | 35 | For example: 36 | QUESTION 1: 37 | My name is Paul Iusztin and I want a post about... 38 | RESPONSE 1: 39 | Paul Iusztin 40 | 41 | QUESTION 2: 42 | I want to write a post about... 43 | RESPONSE 2: 44 | none 45 | 46 | QUESTION 3: 47 | My user id is 1345256 and I want to write a post about...
48 | RESPONSE 3: 49 | 1345256 50 | 51 | User question: {question}""" 52 | 53 | def create_template(self) -> PromptTemplate: 54 | return PromptTemplate(template=self.prompt, input_variables=["question"]) 55 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/query_expanison.py: -------------------------------------------------------------------------------- 1 | import opik 2 | from langchain_openai import ChatOpenAI 3 | from loguru import logger 4 | 5 | from llm_engineering.domain.queries import Query 6 | from llm_engineering.settings import settings 7 | 8 | from .base import RAGStep 9 | from .prompt_templates import QueryExpansionTemplate 10 | 11 | 12 | class QueryExpansion(RAGStep): 13 | @opik.track(name="QueryExpansion.generate") 14 | def generate(self, query: Query, expand_to_n: int) -> list[Query]: 15 | assert expand_to_n > 0, f"'expand_to_n' should be greater than 0. Got {expand_to_n}." 16 | 17 | if self._mock: 18 | return [query for _ in range(expand_to_n)] 19 | 20 | query_expansion_template = QueryExpansionTemplate() 21 | prompt = query_expansion_template.create_template(expand_to_n - 1) 22 | model = ChatOpenAI(model=settings.OPENAI_MODEL_ID, api_key=settings.OPENAI_API_KEY, temperature=0) 23 | 24 | chain = prompt | model 25 | 26 | response = chain.invoke({"question": query}) 27 | result = response.content 28 | 29 | queries_content = result.strip().split(query_expansion_template.separator) 30 | 31 | queries = [query] 32 | queries += [ 33 | query.replace_content(stripped_content) 34 | for content in queries_content 35 | if (stripped_content := content.strip()) 36 | ] 37 | 38 | return queries 39 | 40 | 41 | if __name__ == "__main__": 42 | query = Query.from_str("Write an article about the best types of advanced RAG methods.") 43 | query_expander = QueryExpansion() 44 | expanded_queries = query_expander.generate(query, expand_to_n=3) 45 | for expanded_query in expanded_queries: 46 | logger.info(expanded_query.content) 47 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/reranking.py: -------------------------------------------------------------------------------- 1 | import opik 2 | 3 | from llm_engineering.application.networks import CrossEncoderModelSingleton 4 | from llm_engineering.domain.embedded_chunks import EmbeddedChunk 5 | from llm_engineering.domain.queries import Query 6 | 7 | from .base import RAGStep 8 | 9 | 10 | class Reranker(RAGStep): 11 | def __init__(self, mock: bool = False) -> None: 12 | super().__init__(mock=mock) 13 | 14 | self._model = CrossEncoderModelSingleton() 15 | 16 | @opik.track(name="Reranker.generate") 17 | def generate(self, query: Query, chunks: list[EmbeddedChunk], keep_top_k: int) -> list[EmbeddedChunk]: 18 | if self._mock: 19 | return chunks 20 | 21 | query_doc_tuples = [(query.content, chunk.content) for chunk in chunks] 22 | scores = self._model(query_doc_tuples) 23 | 24 | scored_query_doc_tuples = list(zip(scores, chunks, strict=False)) 25 | scored_query_doc_tuples.sort(key=lambda x: x[0], reverse=True) 26 | 27 | reranked_documents = scored_query_doc_tuples[:keep_top_k] 28 | reranked_documents = [doc for _, doc in reranked_documents] 29 | 30 | return reranked_documents 31 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/retriever.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | 3 | 
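# Retrieval flow implemented below: self-query metadata extraction, query
# expansion, parallel vector search per data category, deduplication, then
# cross-encoder reranking. A minimal usage sketch (assumes Qdrant and the
# OpenAI settings are configured; the query string is illustrative):
#
#   retriever = ContextRetriever(mock=False)
#   documents = retriever.search("Write an article about RAG.", k=3)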
import opik 4 | from loguru import logger 5 | from qdrant_client.models import FieldCondition, Filter, MatchValue 6 | 7 | from llm_engineering.application import utils 8 | from llm_engineering.application.preprocessing.dispatchers import EmbeddingDispatcher 9 | from llm_engineering.domain.embedded_chunks import ( 10 | EmbeddedArticleChunk, 11 | EmbeddedChunk, 12 | EmbeddedPostChunk, 13 | EmbeddedRepositoryChunk, 14 | ) 15 | from llm_engineering.domain.queries import EmbeddedQuery, Query 16 | 17 | from .query_expanison import QueryExpansion 18 | from .reranking import Reranker 19 | from .self_query import SelfQuery 20 | 21 | 22 | class ContextRetriever: 23 | def __init__(self, mock: bool = False) -> None: 24 | self._query_expander = QueryExpansion(mock=mock) 25 | self._metadata_extractor = SelfQuery(mock=mock) 26 | self._reranker = Reranker(mock=mock) 27 | 28 | @opik.track(name="ContextRetriever.search") 29 | def search( 30 | self, 31 | query: str, 32 | k: int = 3, 33 | expand_to_n_queries: int = 3, 34 | ) -> list: 35 | query_model = Query.from_str(query) 36 | 37 | query_model = self._metadata_extractor.generate(query_model) 38 | logger.info( 39 | f"Successfully extracted the author_full_name = {query_model.author_full_name} from the query.", 40 | ) 41 | 42 | n_generated_queries = self._query_expander.generate(query_model, expand_to_n=expand_to_n_queries) 43 | logger.info( 44 | f"Successfully generated {len(n_generated_queries)} search queries.", 45 | ) 46 | 47 | with concurrent.futures.ThreadPoolExecutor() as executor: 48 | search_tasks = [executor.submit(self._search, _query_model, k) for _query_model in n_generated_queries] 49 | 50 | n_k_documents = [task.result() for task in concurrent.futures.as_completed(search_tasks)] 51 | n_k_documents = utils.misc.flatten(n_k_documents) 52 | n_k_documents = list(set(n_k_documents)) 53 | 54 | logger.info(f"{len(n_k_documents)} documents retrieved successfully") 55 | 56 | if len(n_k_documents) > 0: 57 | k_documents = self.rerank(query, chunks=n_k_documents, keep_top_k=k) 58 | else: 59 | k_documents = [] 60 | 61 | return k_documents 62 | 63 | def _search(self, query: Query, k: int = 3) -> list[EmbeddedChunk]: 64 | assert k >= 3, "k should be >= 3" 65 | 66 | def _search_data_category( 67 | data_category_odm: type[EmbeddedChunk], embedded_query: EmbeddedQuery 68 | ) -> list[EmbeddedChunk]: 69 | if embedded_query.author_id: 70 | query_filter = Filter( 71 | must=[ 72 | FieldCondition( 73 | key="author_id", 74 | match=MatchValue( 75 | value=str(embedded_query.author_id), 76 | ), 77 | ) 78 | ] 79 | ) 80 | else: 81 | query_filter = None 82 | 83 | return data_category_odm.search( 84 | query_vector=embedded_query.embedding, 85 | limit=k // 3, 86 | query_filter=query_filter, 87 | ) 88 | 89 | embedded_query: EmbeddedQuery = EmbeddingDispatcher.dispatch(query) 90 | 91 | post_chunks = _search_data_category(EmbeddedPostChunk, embedded_query) 92 | articles_chunks = _search_data_category(EmbeddedArticleChunk, embedded_query) 93 | repositories_chunks = _search_data_category(EmbeddedRepositoryChunk, embedded_query) 94 | 95 | retrieved_chunks = post_chunks + articles_chunks + repositories_chunks 96 | 97 | return retrieved_chunks 98 | 99 | def rerank(self, query: str | Query, chunks: list[EmbeddedChunk], keep_top_k: int) -> list[EmbeddedChunk]: 100 | if isinstance(query, str): 101 | query = Query.from_str(query) 102 | 103 | reranked_documents = self._reranker.generate(query=query, chunks=chunks, keep_top_k=keep_top_k) 104 | 105 | logger.info(f"{len(reranked_documents)} 
documents reranked successfully.") 106 | 107 | return reranked_documents 108 | -------------------------------------------------------------------------------- /llm_engineering/application/rag/self_query.py: -------------------------------------------------------------------------------- 1 | import opik 2 | from langchain_openai import ChatOpenAI 3 | from loguru import logger 4 | 5 | from llm_engineering.application import utils 6 | from llm_engineering.domain.documents import UserDocument 7 | from llm_engineering.domain.queries import Query 8 | from llm_engineering.settings import settings 9 | 10 | from .base import RAGStep 11 | from .prompt_templates import SelfQueryTemplate 12 | 13 | 14 | class SelfQuery(RAGStep): 15 | @opik.track(name="SelfQuery.generate") 16 | def generate(self, query: Query) -> Query: 17 | if self._mock: 18 | return query 19 | 20 | prompt = SelfQueryTemplate().create_template() 21 | model = ChatOpenAI(model=settings.OPENAI_MODEL_ID, api_key=settings.OPENAI_API_KEY, temperature=0) 22 | 23 | chain = prompt | model 24 | 25 | response = chain.invoke({"question": query}) 26 | user_full_name = response.content.strip("\n ") 27 | 28 | if user_full_name == "none": 29 | return query 30 | 31 | first_name, last_name = utils.split_user_full_name(user_full_name) 32 | user = UserDocument.get_or_create(first_name=first_name, last_name=last_name) 33 | 34 | query.author_id = user.id 35 | query.author_full_name = user.full_name 36 | 37 | return query 38 | 39 | 40 | if __name__ == "__main__": 41 | query = Query.from_str("I am Paul Iusztin. Write an article about the best types of advanced RAG methods.") 42 | self_query = SelfQuery() 43 | query = self_query.generate(query) 44 | logger.info(f"Extracted author_id: {query.author_id}") 45 | logger.info(f"Extracted author_full_name: {query.author_full_name}") 46 | -------------------------------------------------------------------------------- /llm_engineering/application/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import misc 2 | from .split_user_full_name import split_user_full_name 3 | 4 | __all__ = ["misc", "split_user_full_name"] 5 | -------------------------------------------------------------------------------- /llm_engineering/application/utils/misc.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from llm_engineering.settings import settings 6 | 7 | 8 | def flatten(nested_list: list) -> list: 9 | """Flatten a list of lists into a single list.""" 10 | 11 | return [item for sublist in nested_list for item in sublist] 12 | 13 | 14 | def batch(list_: list, size: int) -> Generator[list, None, None]: 15 | yield from (list_[i : i + size] for i in range(0, len(list_), size)) 16 | 17 | 18 | def compute_num_tokens(text: str) -> int: 19 | tokenizer = AutoTokenizer.from_pretrained(settings.HF_MODEL_ID) 20 | 21 | return len(tokenizer.encode(text, add_special_tokens=False)) 22 | -------------------------------------------------------------------------------- /llm_engineering/application/utils/split_user_full_name.py: -------------------------------------------------------------------------------- 1 | from llm_engineering.domain.exceptions import ImproperlyConfigured 2 | 3 | 4 | def split_user_full_name(user: str | None) -> tuple[str, str]: 5 | if user is None: 6 | raise ImproperlyConfigured("User name is empty") 7 | 8 | name_tokens = user.split(" ") 9 | if len(name_tokens) == 0: 10 | raise ImproperlyConfigured("User name is empty") 11 | elif len(name_tokens) == 1: 12 | first_name, last_name = name_tokens[0], name_tokens[0] 13 | else: 14 | first_name, last_name = " ".join(name_tokens[:-1]), name_tokens[-1] 15 | 16 | return first_name, last_name 17 | -------------------------------------------------------------------------------- /llm_engineering/domain/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import base, chunks, cleaned_documents, dataset, documents, embedded_chunks, exceptions, inference, prompt, types 2 | 3 | __all__ = [ 4 | "base", 5 | "chunks", 6 | "cleaned_documents", 7 | "dataset", 8 | "documents", 9 | "embedded_chunks", 10 | "exceptions", 11 | "inference", 12 | "types", 13 | "prompt", 14 | ] 15 | -------------------------------------------------------------------------------- /llm_engineering/domain/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .nosql import NoSQLBaseDocument 2 | from .vector import VectorBaseDocument 3 | 4 | __all__ = ["NoSQLBaseDocument", "VectorBaseDocument"] 5 | -------------------------------------------------------------------------------- /llm_engineering/domain/base/nosql.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from abc import ABC 3 | from typing import Generic, Type, TypeVar 4 | 5 | from loguru import logger 6 | from pydantic import UUID4, BaseModel, Field 7 | from pymongo import errors 8 | 9 | from llm_engineering.domain.exceptions import ImproperlyConfigured 10 | from llm_engineering.infrastructure.db.mongo import connection 11 | from llm_engineering.settings import settings 12 | 13 | _database = connection.get_database(settings.DATABASE_NAME) 14 | 15 | 16 | T = TypeVar("T", bound="NoSQLBaseDocument") 17 | 18 | 19 | class NoSQLBaseDocument(BaseModel, Generic[T], ABC): 20 | id: UUID4 = Field(default_factory=uuid.uuid4) 21 | 22 | def __eq__(self, value: object) -> bool: 23 | if not isinstance(value, self.__class__): 24 | return False 25 | 26 | return self.id == value.id 27 | 28 | def __hash__(self) -> int: 29 | return hash(self.id) 30 | 31 | @classmethod 32 | def from_mongo(cls: Type[T], data: dict) -> T: 33 | """Convert "_id" (str object) into "id" (UUID object).""" 34 | 35 | if not data: 36 | raise ValueError("Data is empty.") 37 | 38 | id = data.pop("_id") 39 | 40 | return cls(**dict(data, id=id)) 41 | 42 | def to_mongo(self: T, **kwargs) -> dict: 43 | """Convert "id" (UUID object) into "_id" (str object).""" 44 | exclude_unset = kwargs.pop("exclude_unset", False) 45 | by_alias = kwargs.pop("by_alias", True) 46 | 47 | parsed = self.model_dump(exclude_unset=exclude_unset, by_alias=by_alias, **kwargs) 48 | 49 | if "_id" not in parsed and "id" in parsed: 50 | parsed["_id"] = str(parsed.pop("id")) 51 | 52 | for key, value in parsed.items(): 53 | if isinstance(value, uuid.UUID): 54 | parsed[key] = str(value) 55 | 56 | return parsed 57 | 58 | def model_dump(self: T, **kwargs) -> dict: 59 | dict_ = super().model_dump(**kwargs) 60 | 61 | for key, value in dict_.items(): 62 | if isinstance(value, uuid.UUID): 63 | dict_[key] = str(value) 64 | 65 | return dict_ 66 | 67 | def save(self: T, **kwargs) -> T | None: 68 | collection = _database[self.get_collection_name()] 69 | try: 70 | collection.insert_one(self.to_mongo(**kwargs)) 71 | 72 | return self 73 | except errors.WriteError: 74 | logger.exception("Failed to insert document.") 75 | 76 | return None 77 | 78 | @classmethod 79 | def get_or_create(cls: Type[T], **filter_options) -> T: 80 | collection = _database[cls.get_collection_name()] 81 | try: 82 | instance = collection.find_one(filter_options) 83 | if instance: 84 | return cls.from_mongo(instance) 85 | 86 | new_instance = cls(**filter_options) 87 | new_instance = new_instance.save() 88 | 89 | return new_instance 90 | except errors.OperationFailure: 91 | logger.exception(f"Failed to retrieve document with filter options: 
{filter_options}") 92 | 93 | raise 94 | 95 | @classmethod 96 | def bulk_insert(cls: Type[T], documents: list[T], **kwargs) -> bool: 97 | collection = _database[cls.get_collection_name()] 98 | try: 99 | collection.insert_many(doc.to_mongo(**kwargs) for doc in documents) 100 | 101 | return True 102 | except (errors.WriteError, errors.BulkWriteError): 103 | logger.error(f"Failed to insert documents of type {cls.__name__}") 104 | 105 | return False 106 | 107 | @classmethod 108 | def find(cls: Type[T], **filter_options) -> T | None: 109 | collection = _database[cls.get_collection_name()] 110 | try: 111 | instance = collection.find_one(filter_options) 112 | if instance: 113 | return cls.from_mongo(instance) 114 | 115 | return None 116 | except errors.OperationFailure: 117 | logger.error("Failed to retrieve document") 118 | 119 | return None 120 | 121 | @classmethod 122 | def bulk_find(cls: Type[T], **filter_options) -> list[T]: 123 | collection = _database[cls.get_collection_name()] 124 | try: 125 | instances = collection.find(filter_options) 126 | return [document for instance in instances if (document := cls.from_mongo(instance)) is not None] 127 | except errors.OperationFailure: 128 | logger.error("Failed to retrieve documents") 129 | 130 | return [] 131 | 132 | @classmethod 133 | def get_collection_name(cls: Type[T]) -> str: 134 | if not hasattr(cls, "Settings") or not hasattr(cls.Settings, "name"): 135 | raise ImproperlyConfigured( 136 | "Document should define a Settings configuration class with the name of the collection." 137 | ) 138 | 139 | return cls.Settings.name 140 | -------------------------------------------------------------------------------- /llm_engineering/domain/chunks.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Optional 3 | 4 | from pydantic import UUID4, Field 5 | 6 | from llm_engineering.domain.base import VectorBaseDocument 7 | from llm_engineering.domain.types import DataCategory 8 | 9 | 10 | class Chunk(VectorBaseDocument, ABC): 11 | content: str 12 | platform: str 13 | document_id: UUID4 14 | author_id: UUID4 15 | author_full_name: str 16 | metadata: dict = Field(default_factory=dict) 17 | 18 | 19 | class PostChunk(Chunk): 20 | image: Optional[str] = None 21 | 22 | class Config: 23 | category = DataCategory.POSTS 24 | 25 | 26 | class ArticleChunk(Chunk): 27 | link: str 28 | 29 | class Config: 30 | category = DataCategory.ARTICLES 31 | 32 | 33 | class RepositoryChunk(Chunk): 34 | name: str 35 | link: str 36 | 37 | class Config: 38 | category = DataCategory.REPOSITORIES 39 | -------------------------------------------------------------------------------- /llm_engineering/domain/cleaned_documents.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Optional 3 | 4 | from pydantic import UUID4 5 | 6 | from .base import VectorBaseDocument 7 | from .types import DataCategory 8 | 9 | 10 | class CleanedDocument(VectorBaseDocument, ABC): 11 | content: str 12 | platform: str 13 | author_id: UUID4 14 | author_full_name: str 15 | 16 | 17 | class CleanedPostDocument(CleanedDocument): 18 | image: Optional[str] = None 19 | 20 | class Config: 21 | name = "cleaned_posts" 22 | category = DataCategory.POSTS 23 | use_vector_index = False 24 | 25 | 26 | class CleanedArticleDocument(CleanedDocument): 27 | link: str 28 | 29 | class Config: 30 | name = "cleaned_articles" 31 | category = DataCategory.ARTICLES 32 |
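# Cleaned documents are persisted for lineage and dataset generation rather
# than similarity search, which is presumably why every cleaned collection
# sets use_vector_index = False; only the embedded_* collections defined
# later in this listing enable the vector index.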
use_vector_index = False 33 | 34 | 35 | class CleanedRepositoryDocument(CleanedDocument): 36 | name: str 37 | link: str 38 | 39 | class Config: 40 | name = "cleaned_repositories" 41 | category = DataCategory.REPOSITORIES 42 | use_vector_index = False 43 | -------------------------------------------------------------------------------- /llm_engineering/domain/dataset.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from loguru import logger 4 | 5 | try: 6 | from datasets import Dataset, DatasetDict, concatenate_datasets 7 | except ImportError: 8 | logger.warning("Huggingface datasets not installed. Install with `pip install datasets`") 9 | 10 | 11 | from llm_engineering.domain.base import VectorBaseDocument 12 | from llm_engineering.domain.types import DataCategory 13 | 14 | 15 | class DatasetType(Enum): 16 | INSTRUCTION = "instruction" 17 | PREFERENCE = "preference" 18 | 19 | 20 | class InstructDatasetSample(VectorBaseDocument): 21 | instruction: str 22 | answer: str 23 | 24 | class Config: 25 | category = DataCategory.INSTRUCT_DATASET_SAMPLES 26 | 27 | 28 | class PreferenceDatasetSample(VectorBaseDocument): 29 | instruction: str 30 | rejected: str 31 | chosen: str 32 | 33 | class Config: 34 | category = DataCategory.PREFERENCE_DATASET_SAMPLES 35 | 36 | 37 | class InstructDataset(VectorBaseDocument): 38 | category: DataCategory 39 | samples: list[InstructDatasetSample] 40 | 41 | class Config: 42 | category = DataCategory.INSTRUCT_DATASET 43 | 44 | @property 45 | def num_samples(self) -> int: 46 | return len(self.samples) 47 | 48 | def to_huggingface(self) -> "Dataset": 49 | data = [sample.model_dump() for sample in self.samples] 50 | 51 | return Dataset.from_dict( 52 | {"instruction": [d["instruction"] for d in data], "output": [d["answer"] for d in data]} 53 | ) 54 | 55 | 56 | class TrainTestSplit(VectorBaseDocument): 57 | train: dict 58 | test: dict 59 | test_split_size: float 60 | 61 | def to_huggingface(self, flatten: bool = False) -> "DatasetDict": 62 | train_datasets = {category.value: dataset.to_huggingface() for category, dataset in self.train.items()} 63 | test_datasets = {category.value: dataset.to_huggingface() for category, dataset in self.test.items()} 64 | 65 | if flatten: 66 | train_datasets = concatenate_datasets(list(train_datasets.values())) 67 | test_datasets = concatenate_datasets(list(test_datasets.values())) 68 | else: 69 | train_datasets = Dataset.from_dict(train_datasets) 70 | test_datasets = Dataset.from_dict(test_datasets) 71 | 72 | return DatasetDict({"train": train_datasets, "test": test_datasets}) 73 | 74 | 75 | class InstructTrainTestSplit(TrainTestSplit): 76 | train: dict[DataCategory, InstructDataset] 77 | test: dict[DataCategory, InstructDataset] 78 | test_split_size: float 79 | 80 | class Config: 81 | category = DataCategory.INSTRUCT_DATASET 82 | 83 | 84 | class PreferenceDataset(VectorBaseDocument): 85 | category: DataCategory 86 | samples: list[PreferenceDatasetSample] 87 | 88 | class Config: 89 | category = DataCategory.PREFERENCE_DATASET 90 | 91 | @property 92 | def num_samples(self) -> int: 93 | return len(self.samples) 94 | 95 | def to_huggingface(self) -> "Dataset": 96 | data = [sample.model_dump() for sample in self.samples] 97 | 98 | return Dataset.from_dict( 99 | { 100 | "prompt": [d["instruction"] for d in data], 101 | "rejected": [d["rejected"] for d in data], 102 | "chosen": [d["chosen"] for d in data], 103 | } 104 | ) 105 | 106 | 107 | class 
PreferenceTrainTestSplit(TrainTestSplit): 108 | train: dict[DataCategory, PreferenceDataset] 109 | test: dict[DataCategory, PreferenceDataset] 110 | test_split_size: float 111 | 112 | class Config: 113 | category = DataCategory.PREFERENCE_DATASET 114 | 115 | 116 | def build_dataset(dataset_type, *args, **kwargs) -> InstructDataset | PreferenceDataset: 117 | if dataset_type == DatasetType.INSTRUCTION: 118 | return InstructDataset(*args, **kwargs) 119 | elif dataset_type == DatasetType.PREFERENCE: 120 | return PreferenceDataset(*args, **kwargs) 121 | else: 122 | raise ValueError(f"Invalid dataset type: {dataset_type}") 123 | -------------------------------------------------------------------------------- /llm_engineering/domain/documents.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Optional 3 | 4 | from pydantic import UUID4, Field 5 | 6 | from .base import NoSQLBaseDocument 7 | from .types import DataCategory 8 | 9 | 10 | class UserDocument(NoSQLBaseDocument): 11 | first_name: str 12 | last_name: str 13 | 14 | class Settings: 15 | name = "users" 16 | 17 | @property 18 | def full_name(self): 19 | return f"{self.first_name} {self.last_name}" 20 | 21 | 22 | class Document(NoSQLBaseDocument, ABC): 23 | content: dict 24 | platform: str 25 | author_id: UUID4 = Field(alias="author_id") 26 | author_full_name: str = Field(alias="author_full_name") 27 | 28 | 29 | class RepositoryDocument(Document): 30 | name: str 31 | link: str 32 | 33 | class Settings: 34 | name = DataCategory.REPOSITORIES 35 | 36 | 37 | class PostDocument(Document): 38 | image: Optional[str] = None 39 | link: str | None = None 40 | 41 | class Settings: 42 | name = DataCategory.POSTS 43 | 44 | 45 | class ArticleDocument(Document): 46 | link: str 47 | 48 | class Settings: 49 | name = DataCategory.ARTICLES 50 | -------------------------------------------------------------------------------- /llm_engineering/domain/embedded_chunks.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | from pydantic import UUID4, Field 4 | 5 | from llm_engineering.domain.types import DataCategory 6 | 7 | from .base import VectorBaseDocument 8 | 9 | 10 | class EmbeddedChunk(VectorBaseDocument, ABC): 11 | content: str 12 | embedding: list[float] | None 13 | platform: str 14 | document_id: UUID4 15 | author_id: UUID4 16 | author_full_name: str 17 | metadata: dict = Field(default_factory=dict) 18 | 19 | @classmethod 20 | def to_context(cls, chunks: list["EmbeddedChunk"]) -> str: 21 | context = "" 22 | for i, chunk in enumerate(chunks): 23 | context += f""" 24 | Chunk {i + 1}: 25 | Type: {chunk.__class__.__name__} 26 | Platform: {chunk.platform} 27 | Author: {chunk.author_full_name} 28 | Content: {chunk.content}\n 29 | """ 30 | 31 | return context 32 | 33 | 34 | class EmbeddedPostChunk(EmbeddedChunk): 35 | class Config: 36 | name = "embedded_posts" 37 | category = DataCategory.POSTS 38 | use_vector_index = True 39 | 40 | 41 | class EmbeddedArticleChunk(EmbeddedChunk): 42 | link: str 43 | 44 | class Config: 45 | name = "embedded_articles" 46 | category = DataCategory.ARTICLES 47 | use_vector_index = True 48 | 49 | 50 | class EmbeddedRepositoryChunk(EmbeddedChunk): 51 | name: str 52 | link: str 53 | 54 | class Config: 55 | name = "embedded_repositories" 56 | category = DataCategory.REPOSITORIES 57 | use_vector_index = True 58 | -------------------------------------------------------------------------------- 
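`EmbeddedChunk.to_context` above is the glue between retrieval and prompting: it flattens a list of retrieved chunks into a single string that gets injected into the LLM prompt (see `inference_pipeline_api.py` later in this listing). A minimal usage sketch follows; the field values are illustrative, and it assumes the `VectorBaseDocument` base supplies any remaining defaults such as `id`:

from uuid import uuid4

from llm_engineering.domain.embedded_chunks import EmbeddedArticleChunk, EmbeddedChunk

# Build one illustrative chunk; in the real pipeline these come back from Qdrant.
chunks = [
    EmbeddedArticleChunk(
        content="RAG pairs a retriever with a generator ...",  # illustrative text
        embedding=None,  # the embedding itself is not needed to render the context string
        platform="medium",
        document_id=uuid4(),
        author_id=uuid4(),
        author_full_name="Jane Doe",  # hypothetical author
        link="https://example.com/rag-article",  # hypothetical link
    )
]

context = EmbeddedChunk.to_context(chunks)
# -> "Chunk 1: / Type: EmbeddedArticleChunk / Platform: medium / Author: Jane Doe / Content: ..."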
/llm_engineering/domain/exceptions.py: -------------------------------------------------------------------------------- 1 | class LLMTwinException(Exception): 2 | pass 3 | 4 | 5 | class ImproperlyConfigured(LLMTwinException): 6 | pass 7 | -------------------------------------------------------------------------------- /llm_engineering/domain/inference.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class DeploymentStrategy(ABC): 5 | @abstractmethod 6 | def deploy(self, model, endpoint_name: str, endpoint_config_name: str) -> None: 7 | pass 8 | 9 | 10 | class Inference(ABC): 11 | """An abstract class for performing inference.""" 12 | 13 | def __init__(self): 14 | self.model = None 15 | 16 | @abstractmethod 17 | def set_payload(self, inputs, parameters=None): 18 | pass 19 | 20 | @abstractmethod 21 | def inference(self): 22 | pass 23 | -------------------------------------------------------------------------------- /llm_engineering/domain/prompt.py: -------------------------------------------------------------------------------- 1 | from llm_engineering.domain.base import VectorBaseDocument 2 | from llm_engineering.domain.cleaned_documents import CleanedDocument 3 | from llm_engineering.domain.types import DataCategory 4 | 5 | 6 | class Prompt(VectorBaseDocument): 7 | template: str 8 | input_variables: dict 9 | content: str 10 | num_tokens: int | None = None 11 | 12 | class Config: 13 | category = DataCategory.PROMPT 14 | 15 | 16 | class GenerateDatasetSamplesPrompt(Prompt): 17 | data_category: DataCategory 18 | document: CleanedDocument 19 | -------------------------------------------------------------------------------- /llm_engineering/domain/queries.py: -------------------------------------------------------------------------------- 1 | from pydantic import UUID4, Field 2 | 3 | from llm_engineering.domain.base import VectorBaseDocument 4 | from llm_engineering.domain.types import DataCategory 5 | 6 | 7 | class Query(VectorBaseDocument): 8 | content: str 9 | author_id: UUID4 | None = None 10 | author_full_name: str | None = None 11 | metadata: dict = Field(default_factory=dict) 12 | 13 | class Config: 14 | category = DataCategory.QUERIES 15 | 16 | @classmethod 17 | def from_str(cls, query: str) -> "Query": 18 | return Query(content=query.strip("\n ")) 19 | 20 | def replace_content(self, new_content: str) -> "Query": 21 | return Query( 22 | id=self.id, 23 | content=new_content, 24 | author_id=self.author_id, 25 | author_full_name=self.author_full_name, 26 | metadata=self.metadata, 27 | ) 28 | 29 | 30 | class EmbeddedQuery(Query): 31 | embedding: list[float] 32 | 33 | class Config: 34 | category = DataCategory.QUERIES 35 | -------------------------------------------------------------------------------- /llm_engineering/domain/types.py: -------------------------------------------------------------------------------- 1 | from enum import StrEnum 2 | 3 | 4 | class DataCategory(StrEnum): 5 | PROMPT = "prompt" 6 | QUERIES = "queries" 7 | 8 | INSTRUCT_DATASET_SAMPLES = "instruct_dataset_samples" 9 | INSTRUCT_DATASET = "instruct_dataset" 10 | PREFERENCE_DATASET_SAMPLES = "preference_dataset_samples" 11 | PREFERENCE_DATASET = "preference_dataset" 12 | 13 | POSTS = "posts" 14 | ARTICLES = "articles" 15 | REPOSITORIES = "repositories" 16 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/infrastructure/__init__.py -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/infrastructure/aws/__init__.py -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/infrastructure/aws/deploy/__init__.py -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/delete_sagemaker_endpoint.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | try: 4 | import boto3 5 | from botocore.exceptions import ClientError 6 | except ModuleNotFoundError: 7 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 8 | 9 | 10 | from llm_engineering.settings import settings 11 | 12 | 13 | def delete_endpoint_and_config(endpoint_name) -> None: 14 | """ 15 | Deletes an AWS SageMaker endpoint and its associated configuration. 16 | Args: 17 | endpoint_name (str): The name of the SageMaker endpoint to delete. 18 | Returns: 19 | None 20 | """ 21 | 22 | try: 23 | sagemaker_client = boto3.client( 24 | "sagemaker", 25 | region_name=settings.AWS_REGION, 26 | aws_access_key_id=settings.AWS_ACCESS_KEY, 27 | aws_secret_access_key=settings.AWS_SECRET_KEY, 28 | ) 29 | except Exception: 30 | logger.exception("Error creating SageMaker client.") 31 | 32 | return 33 | 34 | # Get the endpoint configuration name 35 | try: 36 | response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name) 37 | config_name = response["EndpointConfigName"] 38 | except ClientError: 39 | logger.error("Error getting endpoint configuration and model name.") 40 | 41 | return 42 | 43 | # Delete the endpoint 44 | try: 45 | sagemaker_client.delete_endpoint(EndpointName=endpoint_name) 46 | logger.info(f"Endpoint '{endpoint_name}' deletion initiated.") 47 | except ClientError: 48 | logger.error("Error deleting endpoint.") 49 | 50 | try: 51 | response = sagemaker_client.describe_endpoint_config(EndpointConfigName=config_name)  # describe the configuration fetched above, not the endpoint name 52 | model_name = response["ProductionVariants"][0]["ModelName"] 53 | except ClientError: 54 | logger.error("Error getting model name.") 55 | 56 | # Delete the endpoint configuration 57 | try: 58 | sagemaker_client.delete_endpoint_config(EndpointConfigName=config_name) 59 | logger.info(f"Endpoint configuration '{config_name}' deleted.") 60 | except ClientError: 61 | logger.error("Error deleting endpoint configuration.") 62 | 63 | # Delete models 64 | try: 65 | sagemaker_client.delete_model(ModelName=model_name) 66 | logger.info(f"Model '{model_name}' deleted.") 67 | except ClientError: 68 | logger.error("Error deleting model.") 69 | 70 | 71 | if __name__ == "__main__": 72 | endpoint_name = settings.SAGEMAKER_ENDPOINT_INFERENCE 73 | logger.info(f"Attempting to delete
endpoint: {endpoint_name}") 74 | delete_endpoint_and_config(endpoint_name=endpoint_name) 75 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/huggingface/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/infrastructure/aws/deploy/huggingface/__init__.py -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/huggingface/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from loguru import logger 4 | 5 | try: 6 | from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements 7 | except ModuleNotFoundError: 8 | logger.warning("Couldn't load SageMaker imports. Run 'poetry install --with aws' to support AWS.") 9 | 10 | from llm_engineering.settings import settings 11 | 12 | hugging_face_deploy_config = { 13 | "HF_MODEL_ID": settings.HF_MODEL_ID, 14 | "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN, 15 | "SM_NUM_GPUS": json.dumps(settings.SM_NUM_GPUS), # Number of GPU used per replica 16 | "MAX_INPUT_LENGTH": json.dumps(settings.MAX_INPUT_LENGTH), # Max length of input text 17 | "MAX_TOTAL_TOKENS": json.dumps(settings.MAX_TOTAL_TOKENS), # Max length of the generation (including input text) 18 | "MAX_BATCH_TOTAL_TOKENS": json.dumps(settings.MAX_BATCH_TOTAL_TOKENS), 19 | "MAX_BATCH_PREFILL_TOKENS": json.dumps(settings.MAX_BATCH_TOTAL_TOKENS), 20 | "HF_MODEL_QUANTIZE": "bitsandbytes", 21 | } 22 | 23 | 24 | model_resource_config = ResourceRequirements( 25 | requests={ 26 | "copies": settings.COPIES, # Number of replicas. 27 | "num_accelerators": settings.GPUS, # Number of GPUs required. 28 | "num_cpus": settings.CPUS, # Number of CPU cores required. 29 | "memory": 5 * 1024, # Minimum memory required in Mb (required) 30 | }, 31 | ) 32 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/huggingface/run.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | try: 4 | from sagemaker.enums import EndpointType 5 | from sagemaker.huggingface import get_huggingface_llm_image_uri 6 | except ModuleNotFoundError: 7 | logger.warning("Couldn't load SageMaker imports. Run 'poetry install --with aws' to support AWS.") 8 | 9 | from llm_engineering.model.utils import ResourceManager 10 | from llm_engineering.settings import settings 11 | 12 | from .config import hugging_face_deploy_config, model_resource_config 13 | from .sagemaker_huggingface import DeploymentService, SagemakerHuggingfaceStrategy 14 | 15 | 16 | def create_endpoint(endpoint_type=EndpointType.INFERENCE_COMPONENT_BASED) -> None: 17 | assert settings.AWS_ARN_ROLE is not None, "AWS_ARN_ROLE is not set in the .env file." 
18 | 19 | logger.info(f"Creating endpoint with endpoint_type = {endpoint_type} and model_id = {settings.HF_MODEL_ID}") 20 | 21 | llm_image = get_huggingface_llm_image_uri("huggingface", version="2.2.0") 22 | 23 | resource_manager = ResourceManager() 24 | deployment_service = DeploymentService(resource_manager=resource_manager) 25 | 26 | SagemakerHuggingfaceStrategy(deployment_service).deploy( 27 | role_arn=settings.AWS_ARN_ROLE, 28 | llm_image=llm_image, 29 | config=hugging_face_deploy_config, 30 | endpoint_name=settings.SAGEMAKER_ENDPOINT_INFERENCE, 31 | endpoint_config_name=settings.SAGEMAKER_ENDPOINT_CONFIG_INFERENCE, 32 | gpu_instance_type=settings.GPU_INSTANCE_TYPE, 33 | resources=model_resource_config, 34 | endpoint_type=endpoint_type, 35 | ) 36 | 37 | 38 | if __name__ == "__main__": 39 | create_endpoint(endpoint_type=EndpointType.MODEL_BASED) 40 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/deploy/huggingface/sagemaker_huggingface.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from typing import Optional 3 | 4 | from loguru import logger 5 | 6 | try: 7 | import boto3 8 | from sagemaker.enums import EndpointType 9 | from sagemaker.huggingface import HuggingFaceModel 10 | except ModuleNotFoundError: 11 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 12 | 13 | from llm_engineering.domain.inference import DeploymentStrategy 14 | from llm_engineering.settings import settings 15 | 16 | 17 | class SagemakerHuggingfaceStrategy(DeploymentStrategy): 18 | def __init__(self, deployment_service) -> None: 19 | """ 20 | Initializes the deployment strategy with the necessary services. 21 | 22 | :param deployment_service: The service handling the deployment details. 23 | :param logger: Logger for logging information and errors. 24 | """ 25 | self.deployment_service = deployment_service 26 | 27 | def deploy( 28 | self, 29 | role_arn: str, 30 | llm_image: str, 31 | config: dict, 32 | endpoint_name: str, 33 | endpoint_config_name: str, 34 | gpu_instance_type: str, 35 | resources: Optional[dict] = None, 36 | endpoint_type: enum.Enum = EndpointType.MODEL_BASED, 37 | ) -> None: 38 | """ 39 | Initiates the deployment process for a HuggingFace model on AWS SageMaker. 40 | 41 | :param role_arn: AWS role ARN with permissions for SageMaker deployment. 42 | :param llm_image: URI for the HuggingFace model Docker image. 43 | :param config: Configuration settings for the model environment. 44 | :param endpoint_name: Name of the SageMaker endpoint. 45 | :param endpoint_config_name: Name of the SageMaker endpoint configuration. 
46 | :param resources: Optional resources for the model deployment (used for multi-model endpoints) 47 | :param endpoint_type: can be EndpointType.MODEL_BASED (without inference component) 48 | or EndpointType.INFERENCE_COMPONENT_BASED (with inference component) 49 | 50 | """ 51 | 52 | logger.info("Starting deployment using Sagemaker Huggingface Strategy...") 53 | logger.info( 54 | f"Deployment parameters: number of replicas: {settings.COPIES}, number of GPUs: {settings.GPUS}, instance type: {settings.GPU_INSTANCE_TYPE}" 55 | ) 56 | try: 57 | # Delegate to the deployment service to handle the actual deployment details 58 | self.deployment_service.deploy( 59 | role_arn=role_arn, 60 | llm_image=llm_image, 61 | config=config, 62 | endpoint_name=endpoint_name, 63 | endpoint_config_name=endpoint_config_name, 64 | gpu_instance_type=gpu_instance_type, 65 | resources=resources, 66 | endpoint_type=endpoint_type, 67 | ) 68 | logger.info("Deployment completed successfully.") 69 | except Exception as e: 70 | logger.error(f"Error during deployment: {e}") 71 | raise 72 | 73 | 74 | class DeploymentService: 75 | def __init__(self, resource_manager): 76 | """ 77 | Initializes the DeploymentService with necessary dependencies. 78 | 79 | :param resource_manager: Manages resources and configurations for deployments. 80 | Configuration is read from the module-level settings object, and logging 81 | goes through the module-level loguru logger. 82 | """ 83 | 84 | self.sagemaker_client = boto3.client( 85 | "sagemaker", 86 | region_name=settings.AWS_REGION, 87 | aws_access_key_id=settings.AWS_ACCESS_KEY, 88 | aws_secret_access_key=settings.AWS_SECRET_KEY, 89 | ) 90 | self.resource_manager = resource_manager 91 | 92 | def deploy( 93 | self, 94 | role_arn: str, 95 | llm_image: str, 96 | config: dict, 97 | endpoint_name: str, 98 | endpoint_config_name: str, 99 | gpu_instance_type: str, 100 | resources: Optional[dict] = None, 101 | endpoint_type: enum.Enum = EndpointType.MODEL_BASED, 102 | ) -> None: 103 | """ 104 | Handles the deployment of a model to SageMaker, including checking and creating 105 | configurations and endpoints as necessary. 106 | 107 | :param role_arn: The ARN of the IAM role for SageMaker to access resources. 108 | :param llm_image: URI of the Docker image in ECR for the HuggingFace model. 109 | :param config: Configuration dictionary for the environment variables of the model. 110 | :param endpoint_name: The name for the SageMaker endpoint. 111 | :param endpoint_config_name: The name for the SageMaker endpoint configuration. 112 | :param resources: Optional resources for the model deployment (used for multi-model endpoints) 113 | :param endpoint_type: can be EndpointType.MODEL_BASED (without inference component) 114 | or EndpointType.INFERENCE_COMPONENT_BASED (with inference component) 115 | :param gpu_instance_type: The instance type for the SageMaker endpoint. 116 | """ 117 | 118 | try: 119 | # Check if the endpoint configuration exists 120 | if self.resource_manager.endpoint_config_exists(endpoint_config_name=endpoint_config_name): 121 | logger.info(f"Endpoint configuration {endpoint_config_name} exists. Using existing configuration...") 122 | else: 123 | logger.info(f"Endpoint configuration {endpoint_config_name} does not exist.") 124 | 125 | # Prepare and deploy the HuggingFace model 126 | self.prepare_and_deploy_model( 127 | role_arn=role_arn, 128 | llm_image=llm_image, 129 | config=config, 130 | endpoint_name=endpoint_name, 131 | update_endpoint=False, 132 | resources=resources, 133 | endpoint_type=endpoint_type, 134 | gpu_instance_type=gpu_instance_type, 135 | ) 136 | 137 | logger.info(f"Successfully deployed/updated model to endpoint {endpoint_name}.") 138 | except Exception as e: 139 | logger.error(f"Failed to deploy model to SageMaker: {e}") 140 | 141 | raise 142 | 143 | @staticmethod 144 | def prepare_and_deploy_model( 145 | role_arn: str, 146 | llm_image: str, 147 | config: dict, 148 | endpoint_name: str, 149 | update_endpoint: bool, 150 | gpu_instance_type: str, 151 | resources: Optional[dict] = None, 152 | endpoint_type: enum.Enum = EndpointType.MODEL_BASED, 153 | ) -> None: 154 | """ 155 | Prepares and deploys/updates the HuggingFace model on SageMaker. 156 | 157 | :param role_arn: The ARN of the IAM role. 158 | :param llm_image: The Docker image URI for the HuggingFace model. 159 | :param config: Configuration settings for the model. 160 | :param endpoint_name: The name of the endpoint. 161 | :param update_endpoint: Boolean flag to update an existing endpoint. 162 | :param gpu_instance_type: The instance type for the SageMaker endpoint. 163 | :param resources: Optional resources for the model deployment (used for multi-model endpoints) 164 | :param endpoint_type: can be EndpointType.MODEL_BASED (without inference component) 165 | or EndpointType.INFERENCE_COMPONENT_BASED (with inference component) 166 | """ 167 | 168 | huggingface_model = HuggingFaceModel( 169 | role=role_arn, 170 | image_uri=llm_image, 171 | env=config, 172 | ) 173 | 174 | # Deploy or update the model based on the endpoint existence 175 | huggingface_model.deploy( 176 | instance_type=gpu_instance_type, 177 | initial_instance_count=1, 178 | endpoint_name=endpoint_name, 179 | update_endpoint=update_endpoint, 180 | resources=resources, 181 | tags=[{"Key": "task", "Value": "model_task"}], 182 | endpoint_type=endpoint_type, 183 | container_startup_health_check_timeout=900, 184 | ) 185 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/roles/create_execution_role.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | from loguru import logger 5 | 6 | try: 7 | import boto3 8 | except ModuleNotFoundError: 9 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 10 | 11 | from llm_engineering.settings import settings 12 | 13 | 14 | def create_sagemaker_execution_role(role_name: str): 15 | assert settings.AWS_REGION, "AWS_REGION is not set." 16 | assert settings.AWS_ACCESS_KEY, "AWS_ACCESS_KEY is not set." 17 | assert settings.AWS_SECRET_KEY, "AWS_SECRET_KEY is not set."
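# The asserts above fail fast before any AWS call is made. A hypothetical
# alternative is to drop the explicit keys below and let boto3 fall back to its
# default credential chain (environment variables, ~/.aws/credentials, or an
# instance profile); the explicit settings-based wiring keeps the script
# self-contained.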
18 | 19 | # Create IAM client 20 | iam = boto3.client( 21 | "iam", 22 | region_name=settings.AWS_REGION, 23 | aws_access_key_id=settings.AWS_ACCESS_KEY, 24 | aws_secret_access_key=settings.AWS_SECRET_KEY, 25 | ) 26 | 27 | # Define the trust relationship policy 28 | trust_relationship = { 29 | "Version": "2012-10-17", 30 | "Statement": [ 31 | {"Effect": "Allow", "Principal": {"Service": "sagemaker.amazonaws.com"}, "Action": "sts:AssumeRole"} 32 | ], 33 | } 34 | 35 | try: 36 | # Create the IAM role 37 | role = iam.create_role( 38 | RoleName=role_name, 39 | AssumeRolePolicyDocument=json.dumps(trust_relationship), 40 | Description="Execution role for SageMaker", 41 | ) 42 | 43 | # Attach necessary policies 44 | policies = [ 45 | "arn:aws:iam::aws:policy/AmazonSageMakerFullAccess", 46 | "arn:aws:iam::aws:policy/AmazonS3FullAccess", 47 | "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess", 48 | "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess", 49 | ] 50 | 51 | for policy in policies: 52 | iam.attach_role_policy(RoleName=role_name, PolicyArn=policy) 53 | 54 | logger.info(f"Role '{role_name}' created successfully.") 55 | logger.info(f"Role ARN: {role['Role']['Arn']}") 56 | 57 | return role["Role"]["Arn"] 58 | 59 | except iam.exceptions.EntityAlreadyExistsException: 60 | logger.warning(f"Role '{role_name}' already exists. Fetching its ARN...") 61 | role = iam.get_role(RoleName=role_name) 62 | 63 | return role["Role"]["Arn"] 64 | 65 | 66 | if __name__ == "__main__": 67 | role_arn = create_sagemaker_execution_role("SageMakerExecutionRoleLLM") 68 | logger.info(role_arn) 69 | 70 | # Save the role ARN to a file 71 | with Path("sagemaker_execution_role.json").open("w") as f: 72 | json.dump({"RoleArn": role_arn}, f) 73 | 74 | logger.info("Role ARN saved to 'sagemaker_execution_role.json'") 75 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/aws/roles/create_sagemaker_role.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | from loguru import logger 5 | 6 | try: 7 | import boto3 8 | except ModuleNotFoundError: 9 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 10 | 11 | from llm_engineering.settings import settings 12 | 13 | 14 | def create_sagemaker_user(username: str): 15 | assert settings.AWS_REGION, "AWS_REGION is not set." 16 | assert settings.AWS_ACCESS_KEY, "AWS_ACCESS_KEY is not set." 17 | assert settings.AWS_SECRET_KEY, "AWS_SECRET_KEY is not set." 
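# Note: create_access_key (called below) is the only time AWS returns the
# secret access key in plaintext, and the __main__ block persists it to
# 'sagemaker_user_credentials.json', so treat that file as a secret.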
18 | 19 | # Create IAM client 20 | iam = boto3.client( 21 | "iam", 22 | region_name=settings.AWS_REGION, 23 | aws_access_key_id=settings.AWS_ACCESS_KEY, 24 | aws_secret_access_key=settings.AWS_SECRET_KEY, 25 | ) 26 | 27 | # Create user 28 | iam.create_user(UserName=username) 29 | 30 | # Attach necessary policies 31 | policies = [ 32 | "arn:aws:iam::aws:policy/AmazonSageMakerFullAccess", 33 | "arn:aws:iam::aws:policy/AWSCloudFormationFullAccess", 34 | "arn:aws:iam::aws:policy/IAMFullAccess", 35 | "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess", 36 | "arn:aws:iam::aws:policy/AmazonS3FullAccess", 37 | ] 38 | 39 | for policy in policies: 40 | iam.attach_user_policy(UserName=username, PolicyArn=policy) 41 | 42 | # Create access key 43 | response = iam.create_access_key(UserName=username) 44 | access_key = response["AccessKey"] 45 | 46 | logger.info(f"User '{username}' successfully created.") 47 | logger.info("Access Key ID and Secret Access Key successfully created.") 48 | 49 | return {"AccessKeyId": access_key["AccessKeyId"], "SecretAccessKey": access_key["SecretAccessKey"]} 50 | 51 | 52 | if __name__ == "__main__": 53 | new_user = create_sagemaker_user("sagemaker-deployer") 54 | 55 | with Path("sagemaker_user_credentials.json").open("w") as f: 56 | json.dump(new_user, f) 57 | 58 | logger.info("Credentials saved to 'sagemaker_user_credentials.json'") 59 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/db/mongo.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from pymongo import MongoClient 3 | from pymongo.errors import ConnectionFailure 4 | 5 | from llm_engineering.settings import settings 6 | 7 | 8 | class MongoDatabaseConnector: 9 | _instance: MongoClient | None = None 10 | 11 | def __new__(cls, *args, **kwargs) -> MongoClient: 12 | if cls._instance is None: 13 | try: 14 | cls._instance = MongoClient(settings.DATABASE_HOST) 15 | except ConnectionFailure as e: 16 | logger.error(f"Couldn't connect to the database: {e!s}") 17 | 18 | raise 19 | 20 | logger.info(f"Connection to MongoDB with URI successful: {settings.DATABASE_HOST}") 21 | 22 | return cls._instance 23 | 24 | 25 | connection = MongoDatabaseConnector() 26 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/db/qdrant.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from qdrant_client import QdrantClient 3 | from qdrant_client.http.exceptions import UnexpectedResponse 4 | 5 | from llm_engineering.settings import settings 6 | 7 | 8 | class QdrantDatabaseConnector: 9 | _instance: QdrantClient | None = None 10 | 11 | def __new__(cls, *args, **kwargs) -> QdrantClient: 12 | if cls._instance is None: 13 | try: 14 | if settings.USE_QDRANT_CLOUD: 15 | cls._instance = QdrantClient( 16 | url=settings.QDRANT_CLOUD_URL, 17 | api_key=settings.QDRANT_APIKEY, 18 | ) 19 | 20 | uri = settings.QDRANT_CLOUD_URL 21 | else: 22 | cls._instance = QdrantClient( 23 | host=settings.QDRANT_DATABASE_HOST, 24 | port=settings.QDRANT_DATABASE_PORT, 25 | ) 26 | 27 | uri = f"{settings.QDRANT_DATABASE_HOST}:{settings.QDRANT_DATABASE_PORT}" 28 | 29 | logger.info(f"Connection to Qdrant DB with URI successful: {uri}") 30 | except UnexpectedResponse: 31 | logger.exception( 32 | "Couldn't connect to Qdrant.", 33 | host=settings.QDRANT_DATABASE_HOST, 34 | port=settings.QDRANT_DATABASE_PORT, 35 | 
url=settings.QDRANT_CLOUD_URL, 36 | ) 37 | 38 | raise 39 | 40 | return cls._instance 41 | 42 | 43 | connection = QdrantDatabaseConnector() 44 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/files_io.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | 5 | class JsonFileManager: 6 | @classmethod 7 | def read(cls, filename: str | Path) -> list: 8 | file_path: Path = Path(filename) 9 | 10 | try: 11 | with file_path.open("r") as file: 12 | return json.load(file) 13 | except FileNotFoundError: 14 | raise FileNotFoundError(f"File '{file_path=}' does not exist.") from None 15 | except json.JSONDecodeError as e: 16 | raise json.JSONDecodeError( 17 | msg=f"File '{file_path=}' is not properly formatted as JSON.", 18 | doc=e.doc, 19 | pos=e.pos, 20 | ) from None 21 | 22 | @classmethod 23 | def write(cls, filename: str | Path, data: list | dict) -> Path: 24 | file_path: Path = Path(filename) 25 | file_path = file_path.resolve().absolute() 26 | file_path.parent.mkdir(parents=True, exist_ok=True) 27 | 28 | with file_path.open("w") as file: 29 | json.dump(data, file, indent=4) 30 | 31 | return file_path 32 | -------------------------------------------------------------------------------- /llm_engineering/infrastructure/inference_pipeline_api.py: -------------------------------------------------------------------------------- 1 | import opik 2 | from fastapi import FastAPI, HTTPException 3 | from opik import opik_context 4 | from pydantic import BaseModel 5 | 6 | from llm_engineering import settings 7 | from llm_engineering.application.rag.retriever import ContextRetriever 8 | from llm_engineering.application.utils import misc 9 | from llm_engineering.domain.embedded_chunks import EmbeddedChunk 10 | from llm_engineering.infrastructure.opik_utils import configure_opik 11 | from llm_engineering.model.inference import InferenceExecutor, LLMInferenceSagemakerEndpoint 12 | 13 | configure_opik() 14 | 15 | app = FastAPI() 16 | 17 | 18 | class QueryRequest(BaseModel): 19 | query: str 20 | 21 | 22 | class QueryResponse(BaseModel): 23 | answer: str 24 | 25 | 26 | @opik.track 27 | def call_llm_service(query: str, context: str | None) -> str: 28 | llm = LLMInferenceSagemakerEndpoint( 29 | endpoint_name=settings.SAGEMAKER_ENDPOINT_INFERENCE, inference_component_name=None 30 | ) 31 | answer = InferenceExecutor(llm, query, context).execute() 32 | 33 | return answer 34 | 35 | 36 | @opik.track 37 | def rag(query: str) -> str: 38 | retriever = ContextRetriever(mock=False) 39 | documents = retriever.search(query, k=3) 40 | context = EmbeddedChunk.to_context(documents) 41 | 42 | answer = call_llm_service(query, context) 43 | 44 | opik_context.update_current_trace( 45 | tags=["rag"], 46 | metadata={ 47 | "model_id": settings.HF_MODEL_ID, 48 | "embedding_model_id": settings.TEXT_EMBEDDING_MODEL_ID, 49 | "temperature": settings.TEMPERATURE_INFERENCE, 50 | "query_tokens": misc.compute_num_tokens(query), 51 | "context_tokens": misc.compute_num_tokens(context), 52 | "answer_tokens": misc.compute_num_tokens(answer), 53 | }, 54 | ) 55 | 56 | return answer 57 | 58 | 59 | @app.post("/rag", response_model=QueryResponse) 60 | async def rag_endpoint(request: QueryRequest): 61 | try: 62 | answer = rag(query=request.query) 63 | 64 | return {"answer": answer} 65 | except Exception as e: 66 | raise HTTPException(status_code=500, detail=str(e)) from e 67 | 
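With the route above in place, the service can be exercised end to end. A minimal client sketch, assuming the app is served locally on port 8000 (as the `run-inference-ml-service` task in pyproject.toml does via `tools/ml_service.py`) and that a SageMaker inference endpoint is live behind it:

import requests  # any HTTP client works; requests is used here for brevity

response = requests.post(
    "http://127.0.0.1:8000/rag",
    json={"query": "Write a short post about vector databases."},
    timeout=120,  # retrieval plus LLM generation can take a while
)
response.raise_for_status()
print(response.json()["answer"])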
-------------------------------------------------------------------------------- /llm_engineering/infrastructure/opik_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import opik 4 | from loguru import logger 5 | from opik.configurator.configure import OpikConfigurator 6 | 7 | from llm_engineering import settings 8 | 9 | 10 | def configure_opik() -> None: 11 | if settings.COMET_API_KEY and settings.COMET_PROJECT: 12 | try: 13 | client = OpikConfigurator(api_key=settings.COMET_API_KEY) 14 | default_workspace = client._get_default_workspace() 15 | except Exception: 16 | logger.warning("Default workspace not found. Setting workspace to None and enabling interactive mode.") 17 | default_workspace = None 18 | 19 | os.environ["OPIK_PROJECT_NAME"] = settings.COMET_PROJECT 20 | 21 | opik.configure(api_key=settings.COMET_API_KEY, workspace=default_workspace, use_local=False, force=True) 22 | logger.info("Opik configured successfully.") 23 | else: 24 | logger.warning( 25 | "COMET_API_KEY and COMET_PROJECT are not set. Set them to enable prompt monitoring with Opik (powered by Comet ML)." 26 | ) 27 | -------------------------------------------------------------------------------- /llm_engineering/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/model/__init__.py -------------------------------------------------------------------------------- /llm_engineering/model/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/model/evaluation/__init__.py -------------------------------------------------------------------------------- /llm_engineering/model/evaluation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.43.3 2 | datasets==2.20.0 3 | vllm==0.6.1.post2 4 | tqdm==4.66.4 5 | openai==1.55.3 -------------------------------------------------------------------------------- /llm_engineering/model/evaluation/sagemaker.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import HfApi 4 | from loguru import logger 5 | 6 | try: 7 | from sagemaker.huggingface import HuggingFaceProcessor 8 | except ModuleNotFoundError: 9 | logger.warning("Couldn't load SageMaker imports. Run 'poetry install --with aws' to support AWS.") 10 | 11 | from llm_engineering import settings 12 | 13 | evaluation_dir = Path(__file__).resolve().parent 14 | evaluation_requirements_path = evaluation_dir / "requirements.txt" 15 | 16 | 17 | def run_evaluation_on_sagemaker(is_dummy: bool = True) -> None: 18 | assert settings.HUGGINGFACE_ACCESS_TOKEN, "Hugging Face access token is required." 19 | assert settings.OPENAI_API_KEY, "OpenAI API key is required." 20 | assert settings.AWS_ARN_ROLE, "AWS ARN role is required." 
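# The three asserts above guard a remote (billable) SageMaker job: the Hugging
# Face token and OpenAI key are forwarded into the processing container through
# the env dict below, and the ARN role is the IAM role the job runs under.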
21 | 22 | if not evaluation_dir.exists(): 23 | raise FileNotFoundError(f"The directory {evaluation_dir} does not exist.") 24 | if not evaluation_requirements_path.exists(): 25 | raise FileNotFoundError(f"The file {evaluation_requirements_path} does not exist.") 26 | 27 | api = HfApi() 28 | user_info = api.whoami(token=settings.HUGGINGFACE_ACCESS_TOKEN) 29 | huggingface_user = user_info["name"] 30 | logger.info(f"Current Hugging Face user: {huggingface_user}") 31 | 32 | env = { 33 | "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN, 34 | "OPENAI_API_KEY": settings.OPENAI_API_KEY, 35 | "DATASET_HUGGINGFACE_WORKSPACE": huggingface_user, 36 | "MODEL_HUGGINGFACE_WORKSPACE": huggingface_user, 37 | } 38 | if is_dummy: 39 | env["IS_DUMMY"] = "True" 40 | 41 | # Initialize the HuggingFaceProcessor 42 | hfp = HuggingFaceProcessor( 43 | role=settings.AWS_ARN_ROLE, 44 | instance_count=1, 45 | instance_type="ml.g5.2xlarge", 46 | transformers_version="4.36", 47 | pytorch_version="2.1", 48 | py_version="py310", 49 | base_job_name="evaluate-llm-twin", 50 | env=env, 51 | ) 52 | 53 | # Run the processing job 54 | hfp.run( 55 | code="evaluate.py", 56 | source_dir=str(evaluation_dir), 57 | ) 58 | 59 | 60 | if __name__ == "__main__": 61 | run_evaluation_on_sagemaker() 62 | -------------------------------------------------------------------------------- /llm_engineering/model/finetuning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/llm_engineering/model/finetuning/__init__.py -------------------------------------------------------------------------------- /llm_engineering/model/finetuning/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.33.0 2 | torch==2.4.0 3 | transformers==4.43.3 4 | datasets==2.20.0 5 | peft==0.12.0 6 | trl==0.9.6 7 | bitsandbytes==0.43.3 8 | comet-ml==3.44.3 9 | flash-attn==2.3.6 10 | unsloth==2024.9.post2 11 | -------------------------------------------------------------------------------- /llm_engineering/model/finetuning/sagemaker.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from huggingface_hub import HfApi 4 | from loguru import logger 5 | 6 | try: 7 | from sagemaker.huggingface import HuggingFace 8 | except ModuleNotFoundError: 9 | logger.warning("Couldn't load SageMaker imports. Run 'poetry install --with aws' to support AWS.") 10 | 11 | from llm_engineering.settings import settings 12 | 13 | finetuning_dir = Path(__file__).resolve().parent 14 | finetuning_requirements_path = finetuning_dir / "requirements.txt" 15 | 16 | 17 | def run_finetuning_on_sagemaker( 18 | finetuning_type: str = "sft", 19 | num_train_epochs: int = 3, 20 | per_device_train_batch_size: int = 2, 21 | learning_rate: float = 3e-4, 22 | dataset_huggingface_workspace: str = "mlabonne", 23 | is_dummy: bool = False, 24 | ) -> None: 25 | assert settings.HUGGINGFACE_ACCESS_TOKEN, "Hugging Face access token is required." 26 | assert settings.AWS_ARN_ROLE, "AWS ARN role is required." 
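# Note: the HuggingFace estimator defined below serializes the hyperparameters
# dict and hands it to finetune.py as command-line arguments, which is why only
# simple scalar values are used here.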
27 | 28 | if not finetuning_dir.exists(): 29 | raise FileNotFoundError(f"The directory {finetuning_dir} does not exist.") 30 | if not finetuning_requirements_path.exists(): 31 | raise FileNotFoundError(f"The file {finetuning_requirements_path} does not exist.") 32 | 33 | api = HfApi() 34 | user_info = api.whoami(token=settings.HUGGINGFACE_ACCESS_TOKEN) 35 | huggingface_user = user_info["name"] 36 | logger.info(f"Current Hugging Face user: {huggingface_user}") 37 | 38 | hyperparameters = { 39 | "finetuning_type": finetuning_type, 40 | "num_train_epochs": num_train_epochs, 41 | "per_device_train_batch_size": per_device_train_batch_size, 42 | "learning_rate": learning_rate, 43 | "dataset_huggingface_workspace": dataset_huggingface_workspace, 44 | "model_output_huggingface_workspace": huggingface_user, 45 | } 46 | if is_dummy: 47 | hyperparameters["is_dummy"] = True 48 | 49 | # Create the HuggingFace SageMaker estimator 50 | huggingface_estimator = HuggingFace( 51 | entry_point="finetune.py", 52 | source_dir=str(finetuning_dir), 53 | instance_type="ml.g5.2xlarge", 54 | instance_count=1, 55 | role=settings.AWS_ARN_ROLE, 56 | transformers_version="4.36", 57 | pytorch_version="2.1", 58 | py_version="py310", 59 | hyperparameters=hyperparameters, 60 | requirements_file=finetuning_requirements_path, 61 | environment={ 62 | "HUGGING_FACE_HUB_TOKEN": settings.HUGGINGFACE_ACCESS_TOKEN, 63 | "COMET_API_KEY": settings.COMET_API_KEY, 64 | "COMET_PROJECT_NAME": settings.COMET_PROJECT, 65 | }, 66 | ) 67 | 68 | # Start the training job on SageMaker. 69 | huggingface_estimator.fit() 70 | 71 | 72 | if __name__ == "__main__": 73 | run_finetuning_on_sagemaker() 74 | -------------------------------------------------------------------------------- /llm_engineering/model/inference/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import LLMInferenceSagemakerEndpoint 2 | from .run import InferenceExecutor 3 | 4 | __all__ = ["LLMInferenceSagemakerEndpoint", "InferenceExecutor"] 5 | -------------------------------------------------------------------------------- /llm_engineering/model/inference/inference.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict, Optional 3 | 4 | from loguru import logger 5 | 6 | try: 7 | import boto3 8 | except ModuleNotFoundError: 9 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 10 | 11 | 12 | from llm_engineering.domain.inference import Inference 13 | from llm_engineering.settings import settings 14 | 15 | 16 | class LLMInferenceSagemakerEndpoint(Inference): 17 | """ 18 | Class for performing inference using a SageMaker endpoint for LLM schemas. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | endpoint_name: str, 24 | default_payload: Optional[Dict[str, Any]] = None, 25 | inference_component_name: Optional[str] = None, 26 | ) -> None: 27 | super().__init__() 28 | 29 | self.client = boto3.client( 30 | "sagemaker-runtime", 31 | region_name=settings.AWS_REGION, 32 | aws_access_key_id=settings.AWS_ACCESS_KEY, 33 | aws_secret_access_key=settings.AWS_SECRET_KEY, 34 | ) 35 | self.endpoint_name = endpoint_name 36 | self.payload = default_payload if default_payload else self._default_payload() 37 | self.inference_component_name = inference_component_name 38 | 39 | def _default_payload(self) -> Dict[str, Any]: 40 | """ 41 | Generates the default payload for the inference request. 
42 | 43 | Returns: 44 | dict: The default payload. 45 | """ 46 | 47 | return { 48 | "inputs": "How is the weather?", 49 | "parameters": { 50 | "max_new_tokens": settings.MAX_NEW_TOKENS_INFERENCE, 51 | "top_p": settings.TOP_P_INFERENCE, 52 | "temperature": settings.TEMPERATURE_INFERENCE, 53 | "return_full_text": False, 54 | }, 55 | } 56 | 57 | def set_payload(self, inputs: str, parameters: Optional[Dict[str, Any]] = None) -> None: 58 | """ 59 | Sets the payload for the inference request. 60 | 61 | Args: 62 | inputs (str): The input text for the inference. 63 | parameters (dict, optional): Additional parameters for the inference. Defaults to None. 64 | """ 65 | 66 | self.payload["inputs"] = inputs 67 | if parameters: 68 | self.payload["parameters"].update(parameters) 69 | 70 | def inference(self) -> Dict[str, Any]: 71 | """ 72 | Performs the inference request using the SageMaker endpoint. 73 | 74 | Returns: 75 | dict: The response from the inference request. 76 | Raises: 77 | Exception: If an error occurs during the inference request. 78 | """ 79 | 80 | try: 81 | logger.info("Inference request sent.") 82 | invoke_args = { 83 | "EndpointName": self.endpoint_name, 84 | "ContentType": "application/json", 85 | "Body": json.dumps(self.payload), 86 | } 87 | if self.inference_component_name not in ["None", None]: 88 | invoke_args["InferenceComponentName"] = self.inference_component_name 89 | response = self.client.invoke_endpoint(**invoke_args) 90 | response_body = response["Body"].read().decode("utf8") 91 | 92 | return json.loads(response_body) 93 | 94 | except Exception: 95 | logger.exception("SageMaker inference failed.") 96 | 97 | raise 98 | -------------------------------------------------------------------------------- /llm_engineering/model/inference/run.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from llm_engineering.domain.inference import Inference 4 | from llm_engineering.settings import settings 5 | 6 | 7 | class InferenceExecutor: 8 | def __init__( 9 | self, 10 | llm: Inference, 11 | query: str, 12 | context: str | None = None, 13 | prompt: str | None = None, 14 | ) -> None: 15 | self.llm = llm 16 | self.query = query 17 | self.context = context if context else "" 18 | 19 | if prompt is None: 20 | self.prompt = """ 21 | You are a content creator. Write what the user asked you to while using the provided context as the primary source of information for the content. 22 | User query: {query} 23 | Context: {context} 24 | """ 25 | else: 26 | self.prompt = prompt 27 | 28 | def execute(self) -> str: 29 | self.llm.set_payload( 30 | inputs=self.prompt.format(query=self.query, context=self.context), 31 | parameters={ 32 | "max_new_tokens": settings.MAX_NEW_TOKENS_INFERENCE, 33 | "repetition_penalty": 1.1, 34 | "temperature": settings.TEMPERATURE_INFERENCE, 35 | }, 36 | ) 37 | answer = self.llm.inference()[0]["generated_text"] 38 | 39 | return answer 40 | -------------------------------------------------------------------------------- /llm_engineering/model/inference/test.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | from llm_engineering.model.inference.inference import LLMInferenceSagemakerEndpoint 4 | from llm_engineering.model.inference.run import InferenceExecutor 5 | from llm_engineering.settings import settings 6 | 7 | if __name__ == "__main__": 8 | text = "Write me a post about AWS SageMaker inference endpoints." 
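# This smoke test assumes a live endpoint; deploy one first with the
# deploy-inference-endpoint poe task defined in pyproject.toml.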
9 | logger.info(f"Running inference for text: '{text}'") 10 | llm = LLMInferenceSagemakerEndpoint( 11 | endpoint_name=settings.SAGEMAKER_ENDPOINT_INFERENCE, inference_component_name=None 12 | ) 13 | answer = InferenceExecutor(llm, text).execute() 14 | 15 | logger.info(f"Answer: '{answer}'") 16 | -------------------------------------------------------------------------------- /llm_engineering/model/utils.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | try: 4 | import boto3 5 | from botocore.exceptions import ClientError 6 | except ModuleNotFoundError: 7 | logger.warning("Couldn't load AWS or SageMaker imports. Run 'poetry install --with aws' to support AWS.") 8 | 9 | from llm_engineering.settings import settings 10 | 11 | 12 | class ResourceManager: 13 | def __init__(self) -> None: 14 | self.sagemaker_client = boto3.client( 15 | "sagemaker", 16 | region_name=settings.AWS_REGION, 17 | aws_access_key_id=settings.AWS_ACCESS_KEY, 18 | aws_secret_access_key=settings.AWS_SECRET_KEY, 19 | ) 20 | 21 | def endpoint_config_exists(self, endpoint_config_name: str) -> bool: 22 | """Check if the SageMaker endpoint configuration exists.""" 23 | try: 24 | self.sagemaker_client.describe_endpoint_config(EndpointConfigName=endpoint_config_name) 25 | logger.info(f"Endpoint configuration '{endpoint_config_name}' exists.") 26 | return True 27 | except ClientError: 28 | logger.info(f"Endpoint configuration '{endpoint_config_name}' does not exist.") 29 | return False 30 | 31 | def endpoint_exists(self, endpoint_name: str) -> bool: 32 | """Check if the SageMaker endpoint exists.""" 33 | try: 34 | self.sagemaker_client.describe_endpoint(EndpointName=endpoint_name) 35 | logger.info(f"Endpoint '{endpoint_name}' exists.") 36 | return True 37 | except self.sagemaker_client.exceptions.ResourceNotFoundException: 38 | logger.info(f"Endpoint '{endpoint_name}' does not exist.") 39 | return False 40 | -------------------------------------------------------------------------------- /llm_engineering/settings.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from pydantic_settings import BaseSettings, SettingsConfigDict 3 | from zenml.client import Client 4 | from zenml.exceptions import EntityExistsError 5 | 6 | 7 | class Settings(BaseSettings): 8 | model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") 9 | 10 | # --- Required settings even when working locally. --- 11 | 12 | # OpenAI API 13 | OPENAI_MODEL_ID: str = "gpt-4o-mini" 14 | OPENAI_API_KEY: str | None = None 15 | 16 | # Huggingface API 17 | HUGGINGFACE_ACCESS_TOKEN: str | None = None 18 | 19 | # Comet ML (during training) 20 | COMET_API_KEY: str | None = None 21 | COMET_PROJECT: str = "twin" 22 | 23 | # --- Required settings when deploying the code. --- 24 | # --- Otherwise, default values values work fine. 
--- 25 | 26 | # MongoDB database 27 | DATABASE_HOST: str = "mongodb://llm_engineering:llm_engineering@127.0.0.1:27017" 28 | DATABASE_NAME: str = "twin" 29 | 30 | # Qdrant vector database 31 | USE_QDRANT_CLOUD: bool = False 32 | QDRANT_DATABASE_HOST: str = "localhost" 33 | QDRANT_DATABASE_PORT: int = 6333 34 | QDRANT_CLOUD_URL: str = "str" 35 | QDRANT_APIKEY: str | None = None 36 | 37 | # AWS Authentication 38 | AWS_REGION: str = "eu-central-1" 39 | AWS_ACCESS_KEY: str | None = None 40 | AWS_SECRET_KEY: str | None = None 41 | AWS_ARN_ROLE: str | None = None 42 | 43 | # --- Optional settings used to tweak the code. --- 44 | 45 | # AWS SageMaker 46 | HF_MODEL_ID: str = "mlabonne/TwinLlama-3.1-8B-DPO" 47 | GPU_INSTANCE_TYPE: str = "ml.g5.2xlarge" 48 | SM_NUM_GPUS: int = 1 49 | MAX_INPUT_LENGTH: int = 2048 50 | MAX_TOTAL_TOKENS: int = 4096 51 | MAX_BATCH_TOTAL_TOKENS: int = 4096 52 | COPIES: int = 1 # Number of replicas 53 | GPUS: int = 1 # Number of GPUs 54 | CPUS: int = 2 # Number of CPU cores 55 | 56 | SAGEMAKER_ENDPOINT_CONFIG_INFERENCE: str = "twin" 57 | SAGEMAKER_ENDPOINT_INFERENCE: str = "twin" 58 | TEMPERATURE_INFERENCE: float = 0.01 59 | TOP_P_INFERENCE: float = 0.9 60 | MAX_NEW_TOKENS_INFERENCE: int = 150 61 | 62 | # RAG 63 | TEXT_EMBEDDING_MODEL_ID: str = "sentence-transformers/all-MiniLM-L6-v2" 64 | RERANKING_CROSS_ENCODER_MODEL_ID: str = "cross-encoder/ms-marco-MiniLM-L-4-v2" 65 | RAG_MODEL_DEVICE: str = "cpu" 66 | 67 | # LinkedIn Credentials 68 | LINKEDIN_USERNAME: str | None = None 69 | LINKEDIN_PASSWORD: str | None = None 70 | 71 | @property 72 | def OPENAI_MAX_TOKEN_WINDOW(self) -> int: 73 | official_max_token_window = { 74 | "gpt-3.5-turbo": 16385, 75 | "gpt-4-turbo": 128000, 76 | "gpt-4o": 128000, 77 | "gpt-4o-mini": 128000, 78 | }.get(self.OPENAI_MODEL_ID, 128000) 79 | 80 | max_token_window = int(official_max_token_window * 0.90) 81 | 82 | return max_token_window 83 | 84 | @classmethod 85 | def load_settings(cls) -> "Settings": 86 | """ 87 | Tries to load the settings from the ZenML secret store. If the secret does not exist, it initializes the settings from the .env file and default values. 88 | 89 | Returns: 90 | Settings: The initialized settings object. 91 | """ 92 | 93 | try: 94 | logger.info("Loading settings from the ZenML secret store.") 95 | 96 | settings_secrets = Client().get_secret("settings") 97 | settings = Settings(**settings_secrets.secret_values) 98 | except (RuntimeError, KeyError): 99 | logger.warning( 100 | "Failed to load settings from the ZenML secret store. Defaulting to loading the settings from the '.env' file." 101 | ) 102 | settings = Settings() 103 | 104 | return settings 105 | 106 | def export(self) -> None: 107 | """ 108 | Exports the settings to the ZenML secret store. 109 | """ 110 | 111 | env_vars = self.model_dump() 112 | for key, value in env_vars.items(): 113 | env_vars[key] = str(value) 114 | 115 | client = Client() 116 | 117 | try: 118 | client.create_secret(name="settings", values=env_vars) 119 | except EntityExistsError: 120 | logger.warning( 121 | "Secret 'settings' already exists. Delete it manually by running 'zenml secret delete settings', before trying to recreate it."
122 | ) 123 | 124 | 125 | settings = Settings.load_settings() 126 | -------------------------------------------------------------------------------- /pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .digital_data_etl import digital_data_etl 2 | from .end_to_end_data import end_to_end_data 3 | from .evaluating import evaluating 4 | from .export_artifact_to_json import export_artifact_to_json 5 | from .feature_engineering import feature_engineering 6 | from .generate_datasets import generate_datasets 7 | from .training import training 8 | 9 | __all__ = [ 10 | "generate_datasets", 11 | "end_to_end_data", 12 | "evaluating", 13 | "export_artifact_to_json", 14 | "digital_data_etl", 15 | "feature_engineering", 16 | "training", 17 | ] 18 | -------------------------------------------------------------------------------- /pipelines/digital_data_etl.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from steps.etl import crawl_links, get_or_create_user 4 | 5 | 6 | @pipeline 7 | def digital_data_etl(user_full_name: str, links: list[str]) -> str: 8 | user = get_or_create_user(user_full_name) 9 | last_step = crawl_links(user=user, links=links) 10 | 11 | return last_step.invocation_id 12 | -------------------------------------------------------------------------------- /pipelines/end_to_end_data.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from .digital_data_etl import digital_data_etl 4 | from .feature_engineering import feature_engineering 5 | from .generate_datasets import generate_datasets 6 | 7 | 8 | @pipeline 9 | def end_to_end_data( 10 | author_links: list[dict[str, str | list[str]]], 11 | test_split_size: float = 0.1, 12 | push_to_huggingface: bool = False, 13 | dataset_id: str | None = None, 14 | mock: bool = False, 15 | ) -> None: 16 | wait_for_ids = [] 17 | for author_data in author_links: 18 | last_step_invocation_id = digital_data_etl( 19 | user_full_name=author_data["user_full_name"], links=author_data["links"] 20 | ) 21 | 22 | wait_for_ids.append(last_step_invocation_id) 23 | 24 | author_full_names = [author_data["user_full_name"] for author_data in author_links] 25 | wait_for_ids = feature_engineering(author_full_names=author_full_names, wait_for=wait_for_ids) 26 | 27 | generate_datasets( 28 | test_split_size=test_split_size, 29 | push_to_huggingface=push_to_huggingface, 30 | dataset_id=dataset_id, 31 | mock=mock, 32 | wait_for=wait_for_ids, 33 | ) 34 | -------------------------------------------------------------------------------- /pipelines/evaluating.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from steps import evaluating as evaluating_steps 4 | 5 | 6 | @pipeline 7 | def evaluating( 8 | is_dummy: bool = False, 9 | ) -> None: 10 | evaluating_steps.evaluate( 11 | is_dummy=is_dummy, 12 | ) 13 | -------------------------------------------------------------------------------- /pipelines/export_artifact_to_json.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from zenml import pipeline 4 | from zenml.client import Client 5 | 6 | from steps import export as export_steps 7 | 8 | 9 | @pipeline 10 | def export_artifact_to_json(artifact_names: list[str], output_dir: Path = Path("output")) -> None: 11 | for artifact_name in 
artifact_names: 12 | artifact = Client().get_artifact_version(name_id_or_prefix=artifact_name) 13 | 14 | data = export_steps.serialize_artifact(artifact=artifact, artifact_name=artifact_name) 15 | 16 | export_steps.to_json(data=data, to_file=output_dir / f"{artifact_name}.json") 17 | -------------------------------------------------------------------------------- /pipelines/feature_engineering.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from steps import feature_engineering as fe_steps 4 | 5 | 6 | @pipeline 7 | def feature_engineering(author_full_names: list[str], wait_for: str | list[str] | None = None) -> list[str]: 8 | raw_documents = fe_steps.query_data_warehouse(author_full_names, after=wait_for) 9 | 10 | cleaned_documents = fe_steps.clean_documents(raw_documents) 11 | last_step_1 = fe_steps.load_to_vector_db(cleaned_documents) 12 | 13 | embedded_documents = fe_steps.chunk_and_embed(cleaned_documents) 14 | last_step_2 = fe_steps.load_to_vector_db(embedded_documents) 15 | 16 | return [last_step_1.invocation_id, last_step_2.invocation_id] 17 | -------------------------------------------------------------------------------- /pipelines/generate_datasets.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from llm_engineering.domain.dataset import DatasetType 4 | from steps import generate_datasets as cd_steps 5 | 6 | 7 | @pipeline 8 | def generate_datasets( 9 | dataset_type: DatasetType = DatasetType.INSTRUCTION, 10 | test_split_size: float = 0.1, 11 | push_to_huggingface: bool = False, 12 | dataset_id: str | None = None, 13 | mock: bool = False, 14 | wait_for: str | list[str] | None = None, 15 | ) -> None: 16 | cleaned_documents = cd_steps.query_feature_store(after=wait_for) 17 | prompts = cd_steps.create_prompts(documents=cleaned_documents, dataset_type=dataset_type) 18 | if dataset_type == DatasetType.INSTRUCTION: 19 | dataset = cd_steps.generate_intruction_dataset(prompts=prompts, test_split_size=test_split_size, mock=mock) 20 | elif dataset_type == DatasetType.PREFERENCE: 21 | dataset = cd_steps.generate_preference_dataset(prompts=prompts, test_split_size=test_split_size, mock=mock) 22 | else: 23 | raise ValueError(f"Invalid dataset type: {dataset_type}") 24 | 25 | if push_to_huggingface: 26 | cd_steps.push_to_huggingface(dataset=dataset, dataset_id=dataset_id) 27 | -------------------------------------------------------------------------------- /pipelines/training.py: -------------------------------------------------------------------------------- 1 | from zenml import pipeline 2 | 3 | from steps import training as training_steps 4 | 5 | 6 | @pipeline 7 | def training( 8 | finetuning_type: str = "sft", 9 | num_train_epochs: int = 3, 10 | per_device_train_batch_size: int = 2, 11 | learning_rate: float = 3e-4, 12 | dataset_huggingface_workspace: str = "mlabonne", 13 | is_dummy: bool = False, 14 | ) -> None: 15 | training_steps.train( 16 | finetuning_type=finetuning_type, 17 | num_train_epochs=num_train_epochs, 18 | per_device_train_batch_size=per_device_train_batch_size, 19 | learning_rate=learning_rate, 20 | dataset_huggingface_workspace=dataset_huggingface_workspace, 21 | is_dummy=is_dummy, 22 | ) 23 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "llm-engineering" 3 | 
version = "0.1.0" 4 | description = "" 5 | authors = ["iusztinpaul "] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "~3.11" 11 | zenml = { version = "0.74.0", extras = ["server"] } 12 | pymongo = "^4.6.2" 13 | click = "^8.0.1" 14 | loguru = "^0.7.2" 15 | rich = "^13.7.1" 16 | numpy = "^1.26.4" 17 | poethepoet = "0.29.0" 18 | datasets = "^3.0.1" 19 | torch = "2.2.2" 20 | 21 | # Digital data ETL 22 | selenium = "^4.21.0" 23 | webdriver-manager = "^4.0.1" 24 | beautifulsoup4 = "^4.12.3" 25 | html2text = "^2024.2.26" 26 | jmespath = "^1.0.1" 27 | chromedriver-autoinstaller = "^0.6.4" 28 | 29 | # Feature engineering 30 | qdrant-client = "^1.8.0" 31 | langchain = "^0.2.11" 32 | sentence-transformers = "^3.0.0" 33 | 34 | # RAG 35 | langchain-openai = "^0.1.3" 36 | jinja2 = "^3.1.4" 37 | tiktoken = "^0.7.0" 38 | fake-useragent = "^1.5.1" 39 | langchain-community = "^0.2.11" 40 | 41 | # Inference 42 | fastapi = ">=0.100,<=0.110" 43 | uvicorn = "^0.30.6" 44 | opik = "^0.2.2" 45 | 46 | 47 | [tool.poetry.group.dev.dependencies] 48 | ruff = "^0.4.9" 49 | pre-commit = "^3.7.1" 50 | pytest = "^8.2.2" 51 | 52 | 53 | [tool.poetry.group.aws.dependencies] 54 | sagemaker = ">=2.232.2" 55 | s3fs = ">2022.3.0" 56 | aws-profile-manager = "^0.7.3" 57 | kubernetes = "^30.1.0" 58 | sagemaker-huggingface-inference-toolkit = "^2.4.0" 59 | 60 | 61 | [build-system] 62 | requires = ["poetry-core"] 63 | build-backend = "poetry.core.masonry.api" 64 | 65 | # ---------------------------------- 66 | # --- Poe the Poet Configuration --- 67 | # ---------------------------------- 68 | 69 | [tool.poe.tasks] 70 | # Data pipelines 71 | run-digital-data-etl-alex = "echo 'It is not supported anymore.'" 72 | run-digital-data-etl-maxime = "poetry run python -m tools.run --run-etl --no-cache --etl-config-filename digital_data_etl_maxime_labonne.yaml" 73 | run-digital-data-etl-paul = "poetry run python -m tools.run --run-etl --no-cache --etl-config-filename digital_data_etl_paul_iusztin.yaml" 74 | run-digital-data-etl = [ 75 | "run-digital-data-etl-maxime", 76 | "run-digital-data-etl-paul", 77 | ] 78 | run-feature-engineering-pipeline = "poetry run python -m tools.run --no-cache --run-feature-engineering" 79 | run-generate-instruct-datasets-pipeline = "poetry run python -m tools.run --no-cache --run-generate-instruct-datasets" 80 | run-generate-preference-datasets-pipeline = "poetry run python -m tools.run --no-cache --run-generate-preference-datasets" 81 | run-end-to-end-data-pipeline = "poetry run python -m tools.run --no-cache --run-end-to-end-data" 82 | 83 | # Utility pipelines 84 | run-export-artifact-to-json-pipeline = "poetry run python -m tools.run --no-cache --run-export-artifact-to-json" 85 | run-export-data-warehouse-to-json = "poetry run python -m tools.data_warehouse --export-raw-data" 86 | run-import-data-warehouse-from-json = "poetry run python -m tools.data_warehouse --import-raw-data" 87 | 88 | # Training pipelines 89 | run-training-pipeline = "poetry run python -m tools.run --no-cache --run-training" 90 | run-evaluation-pipeline = "poetry run python -m tools.run --no-cache --run-evaluation" 91 | 92 | # Inference 93 | call-rag-retrieval-module = "poetry run python -m tools.rag" 94 | 95 | run-inference-ml-service = "poetry run uvicorn tools.ml_service:app --host 0.0.0.0 --port 8000 --reload" 96 | call-inference-ml-service = "curl -X POST 'http://127.0.0.1:8000/rag' -H 'Content-Type: application/json' -d '{\"query\": \"My name is Paul Iusztin. 
Could you draft a LinkedIn post discussing RAG systems? I am particularly interested in how RAG works and how it is integrated with vector DBs and LLMs.\"}'" 97 | 98 | # Infrastructure 99 | ## Local infrastructure 100 | local-docker-infrastructure-up = "docker compose up -d" 101 | local-docker-infrastructure-down = "docker compose stop" 102 | local-zenml-server-down = "poetry run zenml logout --local" 103 | local-infrastructure-up = [ 104 | "local-docker-infrastructure-up", 105 | "local-zenml-server-down", 106 | "local-zenml-server-up", 107 | ] 108 | local-infrastructure-down = [ 109 | "local-docker-infrastructure-down", 110 | "local-zenml-server-down", 111 | ] 112 | set-local-stack = "poetry run zenml stack set default" 113 | set-aws-stack = "poetry run zenml stack set aws-stack" 114 | set-asynchronous-runs = "poetry run zenml orchestrator update aws-stack --synchronous=False" 115 | zenml-server-disconnect = "poetry run zenml disconnect" 116 | 117 | ## Settings 118 | export-settings-to-zenml = "poetry run python -m tools.run --export-settings" 119 | delete-settings-zenml = "poetry run zenml secret delete settings" 120 | 121 | ## SageMaker 122 | create-sagemaker-role = "poetry run python -m llm_engineering.infrastructure.aws.roles.create_sagemaker_role" 123 | create-sagemaker-execution-role = "poetry run python -m llm_engineering.infrastructure.aws.roles.create_execution_role" 124 | deploy-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.huggingface.run" 125 | test-sagemaker-endpoint = "poetry run python -m llm_engineering.model.inference.test" 126 | delete-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.delete_sagemaker_endpoint" 127 | 128 | ## Docker 129 | build-docker-image = "docker buildx build --platform linux/amd64 -t llmtwin -f Dockerfile ." 130 | run-docker-end-to-end-data-pipeline = "docker run --rm --network host --shm-size=2g --env-file .env llmtwin poetry poe run-end-to-end-data-pipeline" 131 | bash-docker-container = "docker run --rm -it --network host --env-file .env llmtwin bash" 132 | 133 | # QA 134 | lint-check = "poetry run ruff check ." 135 | format-check = "poetry run ruff format --check ." 136 | lint-check-docker = "sh -c 'docker run --rm -i hadolint/hadolint < Dockerfile'" 137 | gitleaks-check = "docker run -v .:/src zricethezav/gitleaks:latest dir /src/llm_engineering" 138 | lint-fix = "poetry run ruff check --fix ." 139 | format-fix = "poetry run ruff format ."
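# Note on the next section: `local-zenml-server-up` is a Poe the Poet "switch"
# task. `control.expr = "sys.platform"` is evaluated first, then the matching
# [[tool.poe.tasks.local-zenml-server-up.switch]] case runs: macOS exports
# OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES to avoid fork-safety crashes when the
# local ZenML server starts, Windows starts the server in blocking mode, and
# every other platform falls through to the default case.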
140 | 141 | [tool.poe.tasks.local-zenml-server-up] 142 | control.expr = "sys.platform" 143 | 144 | [[tool.poe.tasks.local-zenml-server-up.switch]] 145 | case = "darwin" 146 | env = { OBJC_DISABLE_INITIALIZE_FORK_SAFETY = "YES" } 147 | cmd = "poetry run zenml login --local" 148 | 149 | [[tool.poe.tasks.local-zenml-server-up.switch]] 150 | case = "win32" 151 | cmd = "poetry run zenml login --local --blocking" 152 | 153 | [[tool.poe.tasks.local-zenml-server-up.switch]] 154 | cmd = "poetry run zenml login --local" 155 | 156 | # Tests 157 | [tool.poe.tasks.test] 158 | cmd = "poetry run pytest tests/" 159 | env = { ENV_FILE = ".env.testing" } 160 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | line-length = 120 2 | target-version = "py311" 3 | extend-exclude = [ 4 | ".github", 5 | "graphql_client", 6 | "graphql_schemas" 7 | ] 8 | 9 | [lint] 10 | extend-select = [ 11 | "I", 12 | "B", 13 | "G", 14 | "T20", 15 | "PTH", 16 | "RUF" 17 | ] 18 | 19 | [lint.isort] 20 | case-sensitive = true 21 | 22 | [lint.pydocstyle] 23 | convention = "google" -------------------------------------------------------------------------------- /steps/__init__.py: -------------------------------------------------------------------------------- 1 | from . import etl, evaluating, export, feature_engineering, generate_datasets, training 2 | 3 | __all__ = ["generate_datasets", "export", "etl", "feature_engineering", "training", "evaluating"] 4 | -------------------------------------------------------------------------------- /steps/etl/__init__.py: -------------------------------------------------------------------------------- 1 | from .crawl_links import crawl_links 2 | from .get_or_create_user import get_or_create_user 3 | 4 | __all__ = ["crawl_links", "get_or_create_user"] 5 | -------------------------------------------------------------------------------- /steps/etl/crawl_links.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from loguru import logger 4 | from tqdm import tqdm 5 | from typing_extensions import Annotated 6 | from zenml import get_step_context, step 7 | 8 | from llm_engineering.application.crawlers.dispatcher import CrawlerDispatcher 9 | from llm_engineering.domain.documents import UserDocument 10 | 11 | 12 | @step 13 | def crawl_links(user: UserDocument, links: list[str]) -> Annotated[list[str], "crawled_links"]: 14 | dispatcher = CrawlerDispatcher.build().register_linkedin().register_medium().register_github() 15 | 16 | logger.info(f"Starting to crawl {len(links)} link(s).") 17 | 18 | metadata = {} 19 | successful_crawls = 0 20 | for link in tqdm(links): 21 | successful_crawl, crawled_domain = _crawl_link(dispatcher, link, user) 22 | successful_crawls += successful_crawl 23 | 24 | metadata = _add_to_metadata(metadata, crawled_domain, successful_crawl) 25 | 26 | step_context = get_step_context() 27 | step_context.add_output_metadata(output_name="crawled_links", metadata=metadata) 28 | 29 | logger.info(f"Successfully crawled {successful_crawls} / {len(links)} links.") 30 | 31 | return links 32 | 33 | 34 | def _crawl_link(dispatcher: CrawlerDispatcher, link: str, user: UserDocument) -> tuple[bool, str]: 35 | crawler = dispatcher.get_crawler(link) 36 | crawler_domain = urlparse(link).netloc 37 | 38 | try: 39 | crawler.extract(link=link, user=user) 40 | 41 | return (True, crawler_domain) 42 |
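    # Any crawler failure raised below is logged and counted as an unsuccessful
    # crawl for its domain (see _add_to_metadata) instead of failing the whole step.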
except Exception as e: 43 | logger.error(f"An error occurred while crawling: {e!s}") 44 | 45 | return (False, crawler_domain) 46 | 47 | 48 | def _add_to_metadata(metadata: dict, domain: str, successful_crawl: bool) -> dict: 49 | if domain not in metadata: 50 | metadata[domain] = {} 51 | metadata[domain]["successful"] = metadata[domain].get("successful", 0) + successful_crawl 52 | metadata[domain]["total"] = metadata[domain].get("total", 0) + 1 53 | 54 | return metadata 55 | -------------------------------------------------------------------------------- /steps/etl/get_or_create_user.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from typing_extensions import Annotated 3 | from zenml import get_step_context, step 4 | 5 | from llm_engineering.application import utils 6 | from llm_engineering.domain.documents import UserDocument 7 | 8 | 9 | @step 10 | def get_or_create_user(user_full_name: str) -> Annotated[UserDocument, "user"]: 11 | logger.info(f"Getting or creating user: {user_full_name}") 12 | 13 | first_name, last_name = utils.split_user_full_name(user_full_name) 14 | 15 | user = UserDocument.get_or_create(first_name=first_name, last_name=last_name) 16 | 17 | step_context = get_step_context() 18 | step_context.add_output_metadata(output_name="user", metadata=_get_metadata(user_full_name, user)) 19 | 20 | return user 21 | 22 | 23 | def _get_metadata(user_full_name: str, user: UserDocument) -> dict: 24 | return { 25 | "query": { 26 | "user_full_name": user_full_name, 27 | }, 28 | "retrieved": { 29 | "user_id": str(user.id), 30 | "first_name": user.first_name, 31 | "last_name": user.last_name, 32 | }, 33 | } 34 | -------------------------------------------------------------------------------- /steps/evaluating/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluate import evaluate 2 | 3 | __all__ = ["evaluate"] 4 | -------------------------------------------------------------------------------- /steps/evaluating/evaluate.py: -------------------------------------------------------------------------------- 1 | from zenml import step 2 | 3 | from llm_engineering.model.evaluation.sagemaker import run_evaluation_on_sagemaker 4 | 5 | 6 | @step 7 | def evaluate( 8 | is_dummy: bool = False, 9 | ) -> None: 10 | run_evaluation_on_sagemaker( 11 | is_dummy=is_dummy, 12 | ) 13 | -------------------------------------------------------------------------------- /steps/export/__init__.py: -------------------------------------------------------------------------------- 1 | from .serialize_artifact import serialize_artifact 2 | from .to_json import to_json 3 | 4 | __all__ = ["to_json", "serialize_artifact"] 5 | -------------------------------------------------------------------------------- /steps/export/serialize_artifact.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from pydantic import BaseModel 4 | from typing_extensions import Annotated 5 | from zenml import get_step_context, step 6 | 7 | 8 | @step 9 | def serialize_artifact(artifact: Any, artifact_name: str) -> Annotated[dict, "serialized_artifact"]: 10 | serialized_artifact = _serialize_artifact(artifact) 11 | 12 | if serialized_artifact is None: 13 | raise ValueError("Artifact is None") 14 | elif not isinstance(serialized_artifact, dict): 15 | serialized_artifact = {"artifact_data": serialized_artifact} 16 | 17 | step_context = get_step_context() 18 |
step_context.add_output_metadata(output_name="serialized_artifact", metadata={"artifact_name": artifact_name}) 19 | 20 | return serialized_artifact 21 | 22 | 23 | def _serialize_artifact(artifact: list | dict | BaseModel | str | int | float | bool | None): 24 | if isinstance(artifact, list): 25 | return [_serialize_artifact(item) for item in artifact] 26 | elif isinstance(artifact, dict): 27 | return {key: _serialize_artifact(value) for key, value in artifact.items()} 28 | elif isinstance(artifact, BaseModel): 29 | return artifact.model_dump() 30 | else: 31 | return artifact 32 | -------------------------------------------------------------------------------- /steps/export/to_json.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from typing_extensions import Annotated 4 | from zenml import step 5 | 6 | from llm_engineering.infrastructure.files_io import JsonFileManager 7 | 8 | 9 | @step 10 | def to_json( 11 | data: Annotated[dict, "serialized_artifact"], 12 | to_file: Annotated[Path, "to_file"], 13 | ) -> Annotated[Path, "exported_file_path"]: 14 | absolute_file_path = JsonFileManager.write( 15 | filename=to_file, 16 | data=data, 17 | ) 18 | 19 | return absolute_file_path 20 | -------------------------------------------------------------------------------- /steps/feature_engineering/__init__.py: -------------------------------------------------------------------------------- 1 | from .clean import clean_documents 2 | from .load_to_vector_db import load_to_vector_db 3 | from .query_data_warehouse import query_data_warehouse 4 | from .rag import chunk_and_embed 5 | 6 | __all__ = [ 7 | "clean_documents", 8 | "load_to_vector_db", 9 | "query_data_warehouse", 10 | "chunk_and_embed", 11 | ] 12 | -------------------------------------------------------------------------------- /steps/feature_engineering/clean.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Annotated 2 | from zenml import get_step_context, step 3 | 4 | from llm_engineering.application.preprocessing import CleaningDispatcher 5 | from llm_engineering.domain.cleaned_documents import CleanedDocument 6 | 7 | 8 | @step 9 | def clean_documents( 10 | documents: Annotated[list, "raw_documents"], 11 | ) -> Annotated[list, "cleaned_documents"]: 12 | cleaned_documents = [] 13 | for document in documents: 14 | cleaned_document = CleaningDispatcher.dispatch(document) 15 | cleaned_documents.append(cleaned_document) 16 | 17 | step_context = get_step_context() 18 | step_context.add_output_metadata(output_name="cleaned_documents", metadata=_get_metadata(cleaned_documents)) 19 | 20 | return cleaned_documents 21 | 22 | 23 | def _get_metadata(cleaned_documents: list[CleanedDocument]) -> dict: 24 | metadata = {"num_documents": len(cleaned_documents)} 25 | for document in cleaned_documents: 26 | category = document.get_category() 27 | if category not in metadata: 28 | metadata[category] = {} 29 | if "authors" not in metadata[category]: 30 | metadata[category]["authors"] = list() 31 | 32 | metadata[category]["num_documents"] = metadata[category].get("num_documents", 0) + 1 33 | metadata[category]["authors"].append(document.author_full_name) 34 | 35 | for value in metadata.values(): 36 | if isinstance(value, dict) and "authors" in value: 37 | value["authors"] = list(set(value["authors"])) 38 | 39 | return metadata 40 | --------------------------------------------------------------------------------
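Note: the CleaningDispatcher used by clean_documents above lives in llm_engineering/application/preprocessing (dispatchers.py, not inlined in this listing). A minimal sketch of the dispatch-by-document-type pattern such a dispatcher follows — the registry and handler signatures here are illustrative assumptions, not the repository's actual classes:

# Illustrative sketch only: route each raw document to the cleaning handler
# registered for its type; the real dispatcher lives in the preprocessing module.
from typing import Any, Callable

_CLEANING_HANDLERS: dict[type, Callable[[Any], Any]] = {}


def register_handler(document_type: type, handler: Callable[[Any], Any]) -> None:
    _CLEANING_HANDLERS[document_type] = handler


def dispatch(document: Any) -> Any:
    # Walk the MRO so subclasses fall back to a handler registered on a parent class.
    for klass in type(document).__mro__:
        handler = _CLEANING_HANDLERS.get(klass)
        if handler is not None:
            return handler(document)
    raise ValueError(f"No cleaning handler registered for {type(document).__name__}")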
/steps/feature_engineering/load_to_vector_db.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from typing_extensions import Annotated 3 | from zenml import step 4 | 5 | from llm_engineering.application import utils 6 | from llm_engineering.domain.base import VectorBaseDocument 7 | 8 | 9 | @step 10 | def load_to_vector_db( 11 | documents: Annotated[list, "documents"], 12 | ) -> Annotated[bool, "successful"]: 13 | logger.info(f"Loading {len(documents)} documents into the vector database.") 14 | 15 | grouped_documents = VectorBaseDocument.group_by_class(documents) 16 | for document_class, documents in grouped_documents.items(): 17 | logger.info(f"Loading documents into {document_class.get_collection_name()}") 18 | for documents_batch in utils.misc.batch(documents, size=4): 19 | try: 20 | document_class.bulk_insert(documents_batch) 21 | except Exception: 22 | logger.error(f"Failed to insert documents into {document_class.get_collection_name()}") 23 | 24 | return False 25 | 26 | return True 27 | -------------------------------------------------------------------------------- /steps/feature_engineering/query_data_warehouse.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | 3 | from loguru import logger 4 | from typing_extensions import Annotated 5 | from zenml import get_step_context, step 6 | 7 | from llm_engineering.application import utils 8 | from llm_engineering.domain.base.nosql import NoSQLBaseDocument 9 | from llm_engineering.domain.documents import ArticleDocument, Document, PostDocument, RepositoryDocument, UserDocument 10 | 11 | 12 | @step 13 | def query_data_warehouse( 14 | author_full_names: list[str], 15 | ) -> Annotated[list, "raw_documents"]: 16 | documents = [] 17 | authors = [] 18 | for author_full_name in author_full_names: 19 | logger.info(f"Querying data warehouse for user: {author_full_name}") 20 | 21 | first_name, last_name = utils.split_user_full_name(author_full_name) 22 | logger.info(f"First name: {first_name}, Last name: {last_name}") 23 | user = UserDocument.get_or_create(first_name=first_name, last_name=last_name) 24 | authors.append(user) 25 | 26 | results = fetch_all_data(user) 27 | user_documents = [doc for query_result in results.values() for doc in query_result] 28 | 29 | documents.extend(user_documents) 30 | 31 | step_context = get_step_context() 32 | step_context.add_output_metadata(output_name="raw_documents", metadata=_get_metadata(documents)) 33 | 34 | return documents 35 | 36 | 37 | def fetch_all_data(user: UserDocument) -> dict[str, list[NoSQLBaseDocument]]: 38 | user_id = str(user.id) 39 | with ThreadPoolExecutor() as executor: 40 | future_to_query = { 41 | executor.submit(__fetch_articles, user_id): "articles", 42 | executor.submit(__fetch_posts, user_id): "posts", 43 | executor.submit(__fetch_repositories, user_id): "repositories", 44 | } 45 | 46 | results = {} 47 | for future in as_completed(future_to_query): 48 | query_name = future_to_query[future] 49 | try: 50 | results[query_name] = future.result() 51 | except Exception: 52 | logger.exception(f"'{query_name}' request failed.") 53 | 54 | results[query_name] = [] 55 | 56 | return results 57 | 58 | 59 | def __fetch_articles(user_id) -> list[NoSQLBaseDocument]: 60 | return ArticleDocument.bulk_find(author_id=user_id) 61 | 62 | 63 | def __fetch_posts(user_id) -> list[NoSQLBaseDocument]: 64 | return 
PostDocument.bulk_find(author_id=user_id) 65 | 66 | 67 | def __fetch_repositories(user_id) -> list[NoSQLBaseDocument]: 68 | return RepositoryDocument.bulk_find(author_id=user_id) 69 | 70 | 71 | def _get_metadata(documents: list[Document]) -> dict: 72 | metadata = { 73 | "num_documents": len(documents), 74 | } 75 | for document in documents: 76 | collection = document.get_collection_name() 77 | if collection not in metadata: 78 | metadata[collection] = {} 79 | if "authors" not in metadata[collection]: 80 | metadata[collection]["authors"] = list() 81 | 82 | metadata[collection]["num_documents"] = metadata[collection].get("num_documents", 0) + 1 83 | metadata[collection]["authors"].append(document.author_full_name) 84 | 85 | for value in metadata.values(): 86 | if isinstance(value, dict) and "authors" in value: 87 | value["authors"] = list(set(value["authors"])) 88 | 89 | return metadata 90 | -------------------------------------------------------------------------------- /steps/feature_engineering/rag.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Annotated 2 | from zenml import get_step_context, step 3 | 4 | from llm_engineering.application import utils 5 | from llm_engineering.application.preprocessing import ChunkingDispatcher, EmbeddingDispatcher 6 | from llm_engineering.domain.chunks import Chunk 7 | from llm_engineering.domain.embedded_chunks import EmbeddedChunk 8 | 9 | 10 | @step 11 | def chunk_and_embed( 12 | cleaned_documents: Annotated[list, "cleaned_documents"], 13 | ) -> Annotated[list, "embedded_documents"]: 14 | metadata = {"chunking": {}, "embedding": {}, "num_documents": len(cleaned_documents)} 15 | 16 | embedded_chunks = [] 17 | for document in cleaned_documents: 18 | chunks = ChunkingDispatcher.dispatch(document) 19 | metadata["chunking"] = _add_chunks_metadata(chunks, metadata["chunking"]) 20 | 21 | for batched_chunks in utils.misc.batch(chunks, 10): 22 | batched_embedded_chunks = EmbeddingDispatcher.dispatch(batched_chunks) 23 | embedded_chunks.extend(batched_embedded_chunks) 24 | 25 | metadata["embedding"] = _add_embeddings_metadata(embedded_chunks, metadata["embedding"]) 26 | metadata["num_chunks"] = len(embedded_chunks) 27 | metadata["num_embedded_chunks"] = len(embedded_chunks) 28 | 29 | step_context = get_step_context() 30 | step_context.add_output_metadata(output_name="embedded_documents", metadata=metadata) 31 | 32 | return embedded_chunks 33 | 34 | 35 | def _add_chunks_metadata(chunks: list[Chunk], metadata: dict) -> dict: 36 | for chunk in chunks: 37 | category = chunk.get_category() 38 | if category not in metadata: 39 | metadata[category] = chunk.metadata 40 | if "authors" not in metadata[category]: 41 | metadata[category]["authors"] = list() 42 | 43 | metadata[category]["num_chunks"] = metadata[category].get("num_chunks", 0) + 1 44 | metadata[category]["authors"].append(chunk.author_full_name) 45 | 46 | for value in metadata.values(): 47 | if isinstance(value, dict) and "authors" in value: 48 | value["authors"] = list(set(value["authors"])) 49 | 50 | return metadata 51 | 52 | 53 | def _add_embeddings_metadata(embedded_chunks: list[EmbeddedChunk], metadata: dict) -> dict: 54 | for embedded_chunk in embedded_chunks: 55 | category = embedded_chunk.get_category() 56 | if category not in metadata: 57 | metadata[category] = embedded_chunk.metadata 58 | if "authors" not in metadata[category]: 59 | metadata[category]["authors"] = list() 60 | 61 | 
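        # Unlike _add_chunks_metadata above, only authors are accumulated per
        # category here; totals are attached once per step via the num_chunks and
        # num_embedded_chunks fields set in chunk_and_embed.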
metadata[category]["authors"].append(embedded_chunk.author_full_name) 62 | 63 | for value in metadata.values(): 64 | if isinstance(value, dict) and "authors" in value: 65 | value["authors"] = list(set(value["authors"])) 66 | 67 | return metadata 68 | -------------------------------------------------------------------------------- /steps/generate_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .create_prompts import create_prompts 2 | from .generate_intruction_dataset import generate_intruction_dataset 3 | from .generate_preference_dataset import generate_preference_dataset 4 | from .push_to_huggingface import push_to_huggingface 5 | from .query_feature_store import query_feature_store 6 | 7 | __all__ = [ 8 | "generate_intruction_dataset", 9 | "generate_preference_dataset", 10 | "create_prompts", 11 | "push_to_huggingface", 12 | "query_feature_store", 13 | ] 14 | -------------------------------------------------------------------------------- /steps/generate_datasets/create_prompts.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Annotated 2 | from zenml import get_step_context, step 3 | 4 | from llm_engineering.application.dataset import generation 5 | from llm_engineering.domain.dataset import DatasetType 6 | from llm_engineering.domain.prompt import GenerateDatasetSamplesPrompt 7 | from llm_engineering.domain.types import DataCategory 8 | 9 | 10 | @step 11 | def create_prompts( 12 | documents: Annotated[list, "queried_cleaned_documents"], 13 | dataset_type: Annotated[DatasetType, "dataset_type"], 14 | ) -> Annotated[dict[DataCategory, list[GenerateDatasetSamplesPrompt]], "prompts"]: 15 | dataset_generator = generation.get_dataset_generator(dataset_type) 16 | grouped_prompts = dataset_generator.get_prompts(documents) 17 | 18 | step_context = get_step_context() 19 | step_context.add_output_metadata(output_name="prompts", metadata=_get_metadata(grouped_prompts)) 20 | 21 | return grouped_prompts 22 | 23 | 24 | def _get_metadata(grouped_prompts: dict[DataCategory, list[GenerateDatasetSamplesPrompt]]) -> dict: 25 | prompt_categories = list(grouped_prompts.keys()) 26 | prompt_num_samples = {category: len(prompts) for category, prompts in grouped_prompts.items()} 27 | 28 | return {"data_categories": prompt_categories, "data_categories_num_prompts": prompt_num_samples} 29 | -------------------------------------------------------------------------------- /steps/generate_datasets/generate_intruction_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from typing_extensions import Annotated 4 | from zenml import ArtifactConfig, get_step_context, step 5 | 6 | from llm_engineering.application.dataset import generation 7 | from llm_engineering.domain.dataset import DatasetType, InstructTrainTestSplit 8 | from llm_engineering.domain.prompt import GenerateDatasetSamplesPrompt 9 | from llm_engineering.domain.types import DataCategory 10 | 11 | 12 | @step 13 | def generate_intruction_dataset( 14 | prompts: Annotated[dict[DataCategory, list[GenerateDatasetSamplesPrompt]], "prompts"], 15 | test_split_size: Annotated[float, "test_split_size"], 16 | mock: Annotated[bool, "mock_generation"] = False, 17 | ) -> Annotated[ 18 | InstructTrainTestSplit, 19 | ArtifactConfig( 20 | name="instruct_datasets", 21 | tags=["dataset", "instruct", "cleaned"], 22 | ), 23 | ]: 24 | dataset_generator = 
generation.get_dataset_generator(DatasetType.INSTRUCTION) 25 | datasets = dataset_generator.generate(prompts, test_size=test_split_size, mock=mock) 26 | 27 | step_context = get_step_context() 28 | step_context.add_output_metadata(output_name="instruct_datasets", metadata=_get_metadata_instruct_dataset(datasets)) 29 | 30 | return datasets 31 | 32 | 33 | def _get_metadata_instruct_dataset(datasets: InstructTrainTestSplit) -> dict[str, Any]: 34 | instruct_dataset_categories = list(datasets.train.keys()) 35 | train_num_samples = { 36 | category: instruct_dataset.num_samples for category, instruct_dataset in datasets.train.items() 37 | } 38 | test_num_samples = {category: instruct_dataset.num_samples for category, instruct_dataset in datasets.test.items()} 39 | 40 | return { 41 | "data_categories": instruct_dataset_categories, 42 | "test_split_size": datasets.test_split_size, 43 | "train_num_samples_per_category": train_num_samples, 44 | "test_num_samples_per_category": test_num_samples, 45 | } 46 | -------------------------------------------------------------------------------- /steps/generate_datasets/generate_preference_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from typing_extensions import Annotated 4 | from zenml import ArtifactConfig, get_step_context, step 5 | 6 | from llm_engineering.application.dataset import generation 7 | from llm_engineering.domain.dataset import DatasetType, PreferenceTrainTestSplit 8 | from llm_engineering.domain.prompt import GenerateDatasetSamplesPrompt 9 | from llm_engineering.domain.types import DataCategory 10 | 11 | 12 | @step 13 | def generate_preference_dataset( 14 | prompts: Annotated[dict[DataCategory, list[GenerateDatasetSamplesPrompt]], "prompts"], 15 | test_split_size: Annotated[float, "test_split_size"], 16 | mock: Annotated[bool, "mock_generation"] = False, 17 | ) -> Annotated[ 18 | PreferenceTrainTestSplit, 19 | ArtifactConfig( 20 | name="preference_datasets", 21 | tags=["dataset", "preference", "cleaned"], 22 | ), 23 | ]: 24 | dataset_generator = generation.get_dataset_generator(DatasetType.PREFERENCE) 25 | datasets = dataset_generator.generate(prompts, test_size=test_split_size, mock=mock) 26 | 27 | step_context = get_step_context() 28 | step_context.add_output_metadata( 29 | output_name="preference_datasets", metadata=_get_metadata_preference_dataset(datasets) 30 | ) 31 | 32 | return datasets 33 | 34 | 35 | def _get_metadata_preference_dataset(datasets: PreferenceTrainTestSplit) -> dict[str, Any]: 36 | instruct_dataset_categories = list(datasets.train.keys()) 37 | train_num_samples = { 38 | category: instruct_dataset.num_samples for category, instruct_dataset in datasets.train.items() 39 | } 40 | test_num_samples = {category: instruct_dataset.num_samples for category, instruct_dataset in datasets.test.items()} 41 | 42 | return { 43 | "data_categories": instruct_dataset_categories, 44 | "test_split_size": datasets.test_split_size, 45 | "train_num_samples_per_category": train_num_samples, 46 | "test_num_samples_per_category": test_num_samples, 47 | } 48 | -------------------------------------------------------------------------------- /steps/generate_datasets/push_to_huggingface.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from typing_extensions import Annotated 3 | from zenml import step 4 | 5 | from llm_engineering.domain.dataset import InstructTrainTestSplit, PreferenceTrainTestSplit 6 | 
from llm_engineering.settings import settings 7 | 8 | 9 | @step 10 | def push_to_huggingface( 11 | dataset: Annotated[InstructTrainTestSplit | PreferenceTrainTestSplit, "dataset_split"], 12 | dataset_id: Annotated[str, "dataset_id"], 13 | ) -> None: 14 | assert dataset_id is not None, "Dataset id must be provided for pushing to Huggingface" 15 | assert ( 16 | settings.HUGGINGFACE_ACCESS_TOKEN is not None 17 | ), "Huggingface access token must be provided for pushing to Huggingface" 18 | 19 | logger.info(f"Pushing dataset {dataset_id} to Hugging Face.") 20 | 21 | huggingface_dataset = dataset.to_huggingface(flatten=True) 22 | huggingface_dataset.push_to_hub(dataset_id, token=settings.HUGGINGFACE_ACCESS_TOKEN) 23 | -------------------------------------------------------------------------------- /steps/generate_datasets/query_feature_store.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | 3 | from loguru import logger 4 | from qdrant_client.http import exceptions 5 | from typing_extensions import Annotated 6 | from zenml import step 7 | 8 | from llm_engineering.domain.base.nosql import NoSQLBaseDocument 9 | from llm_engineering.domain.cleaned_documents import ( 10 | CleanedArticleDocument, 11 | CleanedDocument, 12 | CleanedPostDocument, 13 | CleanedRepositoryDocument, 14 | ) 15 | 16 | 17 | @step 18 | def query_feature_store() -> Annotated[list, "queried_cleaned_documents"]: 19 | logger.info("Querying feature store.") 20 | 21 | results = fetch_all_data() 22 | 23 | cleaned_documents = [doc for query_result in results.values() for doc in query_result] 24 | 25 | return cleaned_documents 26 | 27 | 28 | def fetch_all_data() -> dict[str, list[NoSQLBaseDocument]]: 29 | with ThreadPoolExecutor() as executor: 30 | future_to_query = { 31 | executor.submit( 32 | __fetch_articles, 33 | ): "articles", 34 | executor.submit( 35 | __fetch_posts, 36 | ): "posts", 37 | executor.submit( 38 | __fetch_repositories, 39 | ): "repositories", 40 | } 41 | 42 | results = {} 43 | for future in as_completed(future_to_query): 44 | query_name = future_to_query[future] 45 | try: 46 | results[query_name] = future.result() 47 | except Exception: 48 | logger.exception(f"'{query_name}' request failed.") 49 | 50 | results[query_name] = [] 51 | 52 | return results 53 | 54 | 55 | def __fetch_articles() -> list[CleanedDocument]: 56 | return __fetch(CleanedArticleDocument) 57 | 58 | 59 | def __fetch_posts() -> list[CleanedDocument]: 60 | return __fetch(CleanedPostDocument) 61 | 62 | 63 | def __fetch_repositories() -> list[CleanedDocument]: 64 | return __fetch(CleanedRepositoryDocument) 65 | 66 | 67 | def __fetch(cleaned_document_type: type[CleanedDocument], limit: int = 1) -> list[CleanedDocument]: 68 | try: 69 | cleaned_documents, next_offset = cleaned_document_type.bulk_find(limit=limit) 70 | except exceptions.UnexpectedResponse: 71 | return [] 72 | 73 | while next_offset: 74 | documents, next_offset = cleaned_document_type.bulk_find(limit=limit, offset=next_offset) 75 | cleaned_documents.extend(documents) 76 | 77 | return cleaned_documents 78 | -------------------------------------------------------------------------------- /steps/training/__init__.py: -------------------------------------------------------------------------------- 1 | from .train import train 2 | 3 | __all__ = ["train"] 4 | -------------------------------------------------------------------------------- /steps/training/train.py: 
-------------------------------------------------------------------------------- 1 | from zenml import step 2 | 3 | from llm_engineering.model.finetuning.sagemaker import run_finetuning_on_sagemaker 4 | 5 | 6 | @step 7 | def train( 8 | finetuning_type: str, 9 | num_train_epochs: int, 10 | per_device_train_batch_size: int, 11 | learning_rate: float, 12 | dataset_huggingface_workspace: str = "mlabonne", 13 | is_dummy: bool = False, 14 | ) -> None: 15 | run_finetuning_on_sagemaker( 16 | finetuning_type=finetuning_type, 17 | num_train_epochs=num_train_epochs, 18 | per_device_train_batch_size=per_device_train_batch_size, 19 | learning_rate=learning_rate, 20 | dataset_huggingface_workspace=dataset_huggingface_workspace, 21 | is_dummy=is_dummy, 22 | ) 23 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/tests/__init__.py -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/tests/integration/__init__.py -------------------------------------------------------------------------------- /tests/integration/integration_example_test.py: -------------------------------------------------------------------------------- 1 | def test_integration_example() -> None: 2 | string = "integration_test_example" 3 | 4 | assert string == "integration_test_example" 5 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/unit_example_test.py: -------------------------------------------------------------------------------- 1 | def test_unit_example() -> None: 2 | string = "unit_test_example" 3 | 4 | assert string == "unit_test_example" 5 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/LLM-Engineers-Handbook/5b7c5afdd6668226fb56183c64006eb7b51382c7/tools/__init__.py -------------------------------------------------------------------------------- /tools/data_warehouse.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import click 5 | from loguru import logger 6 | 7 | from llm_engineering.domain.base.nosql import NoSQLBaseDocument 8 | from llm_engineering.domain.documents import ArticleDocument, PostDocument, RepositoryDocument, UserDocument 9 | 10 | 11 | @click.command() 12 | @click.option( 13 | "--export-raw-data", 14 | is_flag=True, 15 | default=False, 16 | help="Whether to export your data warehouse to a JSON file.", 17 | ) 18 | @click.option( 19 | "--import-raw-data", 20 | is_flag=True, 21 | default=False, 22 | help="Whether to import a JSON file into your data warehouse.", 23 | ) 24 | @click.option( 
25 | "--data-dir", 26 | default=Path("data/data_warehouse_raw_data"), 27 | type=Path, 28 | help="Path to the directory containing data warehouse raw data JSON files.", 29 | ) 30 | def main( 31 | export_raw_data, 32 | import_raw_data, 33 | data_dir: Path, 34 | ) -> None: 35 | assert export_raw_data or import_raw_data, "Specify at least one operation." 36 | 37 | if export_raw_data: 38 | __export(data_dir) 39 | 40 | if import_raw_data: 41 | __import(data_dir) 42 | 43 | 44 | def __export(data_dir: Path) -> None: 45 | logger.info(f"Exporting data warehouse to {data_dir}...") 46 | data_dir.mkdir(parents=True, exist_ok=True) 47 | 48 | __export_data_category(data_dir, ArticleDocument) 49 | __export_data_category(data_dir, PostDocument) 50 | __export_data_category(data_dir, RepositoryDocument) 51 | __export_data_category(data_dir, UserDocument) 52 | 53 | 54 | def __export_data_category(data_dir: Path, category_class: type[NoSQLBaseDocument]) -> None: 55 | data = category_class.bulk_find() 56 | serialized_data = [d.to_mongo() for d in data] 57 | export_file = data_dir / f"{category_class.__name__}.json" 58 | 59 | logger.info(f"Exporting {len(serialized_data)} items of {category_class.__name__} to {export_file}...") 60 | with export_file.open("w") as f: 61 | json.dump(serialized_data, f) 62 | 63 | 64 | def __import(data_dir: Path) -> None: 65 | logger.info(f"Importing data warehouse from {data_dir}...") 66 | assert data_dir.is_dir(), f"{data_dir} is not a directory or it doesn't exists." 67 | 68 | data_category_classes = { 69 | "ArticleDocument": ArticleDocument, 70 | "PostDocument": PostDocument, 71 | "RepositoryDocument": RepositoryDocument, 72 | "UserDocument": UserDocument, 73 | } 74 | 75 | for file in data_dir.iterdir(): 76 | if not file.is_file(): 77 | continue 78 | 79 | category_class_name = file.stem 80 | category_class = data_category_classes.get(category_class_name) 81 | if not category_class: 82 | logger.warning(f"Skipping {file} as it does not match any data category.") 83 | continue 84 | 85 | __import_data_category(file, category_class) 86 | 87 | 88 | def __import_data_category(file: Path, category_class: type[NoSQLBaseDocument]) -> None: 89 | with file.open("r") as f: 90 | data = json.load(f) 91 | 92 | logger.info(f"Importing {len(data)} items of {category_class.__name__} from {file}...") 93 | if len(data) > 0: 94 | deserialized_data = [category_class.from_mongo(d) for d in data] 95 | category_class.bulk_insert(deserialized_data) 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /tools/ml_service.py: -------------------------------------------------------------------------------- 1 | from llm_engineering.infrastructure.inference_pipeline_api import app # noqa 2 | 3 | if __name__ == "__main__": 4 | import uvicorn 5 | 6 | uvicorn.run("tools.ml_service:app", host="0.0.0.0", port=8000, reload=True) 7 | -------------------------------------------------------------------------------- /tools/rag.py: -------------------------------------------------------------------------------- 1 | from langchain.globals import set_verbose 2 | from loguru import logger 3 | 4 | from llm_engineering.application.rag.retriever import ContextRetriever 5 | from llm_engineering.infrastructure.opik_utils import configure_opik 6 | 7 | if __name__ == "__main__": 8 | configure_opik() 9 | set_verbose(True) 10 | 11 | query = """ 12 | My name is Paul Iusztin. 13 | 14 | Could you draft a LinkedIn post discussing RAG systems? 
15 | I'm particularly interested in: 16 | - how RAG works 17 | - how it is integrated with vector DBs and large language models (LLMs). 18 | """ 19 | 20 | retriever = ContextRetriever(mock=False) 21 | documents = retriever.search(query, k=9) 22 | 23 | logger.info("Retrieved documents:") 24 | for rank, document in enumerate(documents): 25 | logger.info(f"{rank + 1}: {document}") 26 | -------------------------------------------------------------------------------- /tools/run.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime as dt 2 | from pathlib import Path 3 | 4 | import click 5 | from loguru import logger 6 | 7 | from llm_engineering import settings 8 | from pipelines import ( 9 | digital_data_etl, 10 | end_to_end_data, 11 | evaluating, 12 | export_artifact_to_json, 13 | feature_engineering, 14 | generate_datasets, 15 | training, 16 | ) 17 | 18 | 19 | @click.command( 20 | help=""" 21 | LLM Engineering project CLI v0.0.1. 22 | 23 | Main entry point for the pipeline execution. 24 | This entrypoint is where everything comes together. 25 | 26 | Run the ZenML LLM Engineering project pipelines with various options. 27 | 28 | Run a pipeline with the required parameters. This executes 29 | all steps in the pipeline in the correct order using the orchestrator 30 | stack component that is configured in your active ZenML stack. 31 | 32 | Examples: 33 | 34 | \b 35 | # Show the available options 36 | python run.py --help 37 | 38 | \b 39 | # Run the pipeline without cache 40 | python run.py --no-cache 41 | 42 | \b 43 | # Run only the ETL pipeline 44 | python run.py --run-etl 45 | 46 | """ 47 | ) 48 | @click.option( 49 | "--no-cache", 50 | is_flag=True, 51 | default=False, 52 | help="Disable caching for the pipeline run.", 53 | ) 54 | @click.option( 55 | "--run-end-to-end-data", 56 | is_flag=True, 57 | default=False, 58 | help="Whether to run all the data pipelines in one go.", 59 | ) 60 | @click.option( 61 | "--run-etl", 62 | is_flag=True, 63 | default=False, 64 | help="Whether to run the ETL pipeline.", 65 | ) 66 | @click.option( 67 | "--run-export-artifact-to-json", 68 | is_flag=True, 69 | default=False, 70 | help="Whether to run the Artifact -> JSON pipeline.", 71 | ) 72 | @click.option( 73 | "--etl-config-filename", 74 | default="digital_data_etl_paul_iusztin.yaml", 75 | help="Filename of the ETL config file.", 76 | ) 77 | @click.option( 78 | "--run-feature-engineering", 79 | is_flag=True, 80 | default=False, 81 | help="Whether to run the FE pipeline.", 82 | ) 83 | @click.option( 84 | "--run-generate-instruct-datasets", 85 | is_flag=True, 86 | default=False, 87 | help="Whether to run the instruct dataset generation pipeline.", 88 | ) 89 | @click.option( 90 | "--run-generate-preference-datasets", 91 | is_flag=True, 92 | default=False, 93 | help="Whether to run the preference dataset generation pipeline.", 94 | ) 95 | @click.option( 96 | "--run-training", 97 | is_flag=True, 98 | default=False, 99 | help="Whether to run the training pipeline.", 100 | ) 101 | @click.option( 102 | "--run-evaluation", 103 | is_flag=True, 104 | default=False, 105 | help="Whether to run the evaluation pipeline.", 106 | ) 107 | @click.option( 108 | "--export-settings", 109 | is_flag=True, 110 | default=False, 111 | help="Whether to export your settings to ZenML or not.", 112 | ) 113 | def main( 114 | no_cache: bool = False, 115 | run_end_to_end_data: bool = False, 116 | run_etl: bool = False, 117 | etl_config_filename: str =
"digital_data_etl_paul_iusztin.yaml", 118 | run_export_artifact_to_json: bool = False, 119 | run_feature_engineering: bool = False, 120 | run_generate_instruct_datasets: bool = False, 121 | run_generate_preference_datasets: bool = False, 122 | run_training: bool = False, 123 | run_evaluation: bool = False, 124 | export_settings: bool = False, 125 | ) -> None: 126 | assert ( 127 | run_end_to_end_data 128 | or run_etl 129 | or run_export_artifact_to_json 130 | or run_feature_engineering 131 | or run_generate_instruct_datasets 132 | or run_generate_preference_datasets 133 | or run_training 134 | or run_evaluation 135 | or export_settings 136 | ), "Please specify an action to run." 137 | 138 | if export_settings: 139 | logger.info("Exporting settings to ZenML secrets.") 140 | settings.export() 141 | 142 | pipeline_args = { 143 | "enable_cache": not no_cache, 144 | } 145 | root_dir = Path(__file__).resolve().parent.parent 146 | 147 | if run_end_to_end_data: 148 | run_args_end_to_end = {} 149 | pipeline_args["config_path"] = root_dir / "configs" / "end_to_end_data.yaml" 150 | assert pipeline_args["config_path"].exists(), f"Config file not found: {pipeline_args['config_path']}" 151 | pipeline_args["run_name"] = f"end_to_end_data_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 152 | end_to_end_data.with_options(**pipeline_args)(**run_args_end_to_end) 153 | 154 | if run_etl: 155 | run_args_etl = {} 156 | pipeline_args["config_path"] = root_dir / "configs" / etl_config_filename 157 | assert pipeline_args["config_path"].exists(), f"Config file not found: {pipeline_args['config_path']}" 158 | pipeline_args["run_name"] = f"digital_data_etl_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 159 | digital_data_etl.with_options(**pipeline_args)(**run_args_etl) 160 | 161 | if run_export_artifact_to_json: 162 | run_args_etl = {} 163 | pipeline_args["config_path"] = root_dir / "configs" / "export_artifact_to_json.yaml" 164 | assert pipeline_args["config_path"].exists(), f"Config file not found: {pipeline_args['config_path']}" 165 | pipeline_args["run_name"] = f"export_artifact_to_json_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 166 | export_artifact_to_json.with_options(**pipeline_args)(**run_args_etl) 167 | 168 | if run_feature_engineering: 169 | run_args_fe = {} 170 | pipeline_args["config_path"] = root_dir / "configs" / "feature_engineering.yaml" 171 | pipeline_args["run_name"] = f"feature_engineering_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 172 | feature_engineering.with_options(**pipeline_args)(**run_args_fe) 173 | 174 | if run_generate_instruct_datasets: 175 | run_args_cd = {} 176 | pipeline_args["config_path"] = root_dir / "configs" / "generate_instruct_datasets.yaml" 177 | pipeline_args["run_name"] = f"generate_instruct_datasets_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 178 | generate_datasets.with_options(**pipeline_args)(**run_args_cd) 179 | 180 | if run_generate_preference_datasets: 181 | run_args_cd = {} 182 | pipeline_args["config_path"] = root_dir / "configs" / "generate_preference_datasets.yaml" 183 | pipeline_args["run_name"] = f"generate_preference_datasets_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 184 | generate_datasets.with_options(**pipeline_args)(**run_args_cd) 185 | 186 | if run_training: 187 | run_args_cd = {} 188 | pipeline_args["config_path"] = root_dir / "configs" / "training.yaml" 189 | pipeline_args["run_name"] = f"training_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 190 | training.with_options(**pipeline_args)(**run_args_cd) 191 | 192 | if run_evaluation: 193 | 
run_args_cd = {} 194 | pipeline_args["config_path"] = root_dir / "configs" / "evaluating.yaml" 195 | pipeline_args["run_name"] = f"evaluation_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}" 196 | evaluating.with_options(**pipeline_args)(**run_args_cd) 197 | 198 | 199 | if __name__ == "__main__": 200 | main() 201 | --------------------------------------------------------------------------------
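Closing usage sketch: the pipelines above can also be triggered programmatically, mirroring what tools/run.py does internally rather than adding anything new. This assumes the local infrastructure is already running and the YAML config exists in configs/:

# Minimal sketch: run the digital data ETL the same way tools/run.py wires it up.
# Assumes a running local ZenML stack and the config file shipped in configs/.
from datetime import datetime as dt
from pathlib import Path

from pipelines import digital_data_etl

config_path = Path("configs") / "digital_data_etl_paul_iusztin.yaml"
assert config_path.exists(), f"Config file not found: {config_path}"

digital_data_etl.with_options(
    enable_cache=False,
    config_path=config_path,
    run_name=f"digital_data_etl_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}",
)()  # Pipeline parameters (user_full_name, links) come from the YAML config.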