",
9 | ]
10 | package-mode = false
11 | readme = "README.md"
12 |
13 |
14 | [tool.poetry.dependencies]
15 | python = ">=3.10, <3.12"
16 | pydantic = "^2.6.3"
17 | pydantic-settings = "^2.1.0"
18 | bytewax = "0.18.2"
19 | pika = "^1.3.2"
20 | qdrant-client = "^1.8.0"
21 | unstructured = "^0.12.6"
22 | langchain = "^0.1.13"
23 | sentence-transformers = "^2.6.1"
24 | instructorembedding = "^1.0.1"
25 | numpy = "^1.26.4"
26 | langchain-openai = "^0.1.3"
27 | gdown = "^5.1.0"
28 | pymongo = "^4.7.1"
29 | structlog = "^24.1.0"
30 | rich = "^13.7.1"
31 | pip = "^24.0"
32 | install = "^1.3.5"
33 | comet-ml = "^3.41.0"
34 | ruff = "^0.4.3"
35 | comet-llm = "^2.2.4"
36 | qwak-sdk = "^0.5.69"
37 | pandas = "^2.2.2"
38 | datasets = "^2.19.1"
39 | peft = "^0.11.1"
40 | bitsandbytes = "^0.43.1"
41 | qwak-inference = "^0.1.17"
42 |
43 |
44 | [build-system]
45 | requires = ["poetry-core"]
46 | build-backend = "poetry.core.masonry.api"
47 |
48 |
49 | [tool.ruff]
50 | line-length = 88
51 | select = ["F401", "F403"]
52 |
--------------------------------------------------------------------------------
/5-inference/rag/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/J-coder118/LLM-Twin/707f9d8bb1cf402e04644bff9c5c521ce0938087/5-inference/rag/__init__.py
--------------------------------------------------------------------------------
/5-inference/rag/query_expanison.py:
--------------------------------------------------------------------------------
1 | from langchain_openai import ChatOpenAI
2 |
3 | from llm_components.chain import GeneralChain
4 | from llm_components.prompt_templates import QueryExpansionTemplate
5 | from config import settings
6 |
7 |
8 | class QueryExpansion:
9 | @staticmethod
10 | def generate_response(query: str, to_expand_to_n: int) -> list[str]:
11 | query_expansion_template = QueryExpansionTemplate()
12 | prompt_template = query_expansion_template.create_template(to_expand_to_n)
13 | model = ChatOpenAI(
14 | model=settings.OPENAI_MODEL_ID,
15 | api_key=settings.OPENAI_API_KEY,
16 | temperature=0,
17 | )
18 |
19 | chain = GeneralChain().get_chain(
20 | llm=model, output_key="expanded_queries", template=prompt_template
21 | )
22 |
23 | response = chain.invoke({"question": query})
24 | result = response["expanded_queries"]
25 |
26 | queries = result.strip().split(query_expansion_template.separator)
27 | stripped_queries = [
28 | stripped_item for item in queries if (stripped_item := item.strip())
29 | ]
30 |
31 | return stripped_queries
32 |
--------------------------------------------------------------------------------
/5-inference/rag/reranking.py:
--------------------------------------------------------------------------------
1 | from langchain_openai import ChatOpenAI
2 | from llm_components.chain import GeneralChain
3 | from llm_components.prompt_templates import RerankingTemplate
4 |
5 | from config import settings
6 |
7 |
8 | class Reranker:
9 | @staticmethod
10 | def generate_response(
11 | query: str, passages: list[str], keep_top_k: int
12 | ) -> list[str]:
13 | reranking_template = RerankingTemplate()
14 | prompt_template = reranking_template.create_template(keep_top_k=keep_top_k)
15 |
16 | model = ChatOpenAI(model=settings.OPENAI_MODEL_ID, api_key=settings.OPENAI_API_KEY)
17 | chain = GeneralChain().get_chain(
18 | llm=model, output_key="rerank", template=prompt_template
19 | )
20 |
21 | stripped_passages = [
22 | stripped_item for item in passages if (stripped_item := item.strip())
23 | ]
24 | passages = reranking_template.separator.join(stripped_passages)
25 | response = chain.invoke({"question": query, "passages": passages})
26 |
27 | result = response["rerank"]
28 | reranked_passages = result.strip().split(reranking_template.separator)
29 | stripped_passages = [
30 | stripped_item
31 | for item in reranked_passages
32 | if (stripped_item := item.strip())
33 | ]
34 |
35 | return stripped_passages
36 |
--------------------------------------------------------------------------------
/5-inference/rag/retriever.py:
--------------------------------------------------------------------------------
1 | import concurrent.futures
2 |
3 | import core.logger_utils as logger_utils
4 | from core.db.qdrant import QdrantDatabaseConnector
5 | from qdrant_client import models
6 | from sentence_transformers.SentenceTransformer import SentenceTransformer
7 |
8 | import utils
9 | from rag.query_expanison import QueryExpansion
10 | from rag.reranking import Reranker
11 | from rag.self_query import SelfQuery
12 | from config import settings
13 |
14 | logger = logger_utils.get_logger(__name__)
15 |
16 |
17 | class VectorRetriever:
18 | """
19 | Class for retrieving vectors from a Vector store in a RAG system using query expansion and Multitenancy search.
20 | """
21 |
22 | def __init__(self, query: str) -> None:
23 | self._client = QdrantDatabaseConnector()
24 | self.query = query
25 | self._embedder = SentenceTransformer(settings.EMBEDDING_MODEL_ID)
26 | self._query_expander = QueryExpansion()
27 | self._metadata_extractor = SelfQuery()
28 | self._reranker = Reranker()
29 |
30 | def _search_single_query(
31 | self, generated_query: str, metadata_filter_value: str, k: int
32 | ):
33 | assert k > 3, "k should be greater than 3"
34 |
35 | query_vector = self._embedder.encode(generated_query).tolist()
36 |
37 | vectors = [
38 | self._client.search(
39 | collection_name="vector_posts",
40 | query_filter=models.Filter(
41 | must=[
42 | models.FieldCondition(
43 | key="author_id",
44 | match=models.MatchValue(
45 | value=metadata_filter_value,
46 | ),
47 | )
48 | ]
49 | ),
50 | query_vector=query_vector,
51 | limit=k // 3,
52 | ),
53 | self._client.search(
54 | collection_name="vector_articles",
55 | query_filter=models.Filter(
56 | must=[
57 | models.FieldCondition(
58 | key="author_id",
59 | match=models.MatchValue(
60 | value=metadata_filter_value,
61 | ),
62 | )
63 | ]
64 | ),
65 | query_vector=query_vector,
66 | limit=k // 3,
67 | ),
68 | self._client.search(
69 | collection_name="vector_repositories",
70 | query_filter=models.Filter(
71 | must=[
72 | models.FieldCondition(
73 | key="owner_id",
74 | match=models.MatchValue(
75 | value=metadata_filter_value,
76 | ),
77 | )
78 | ]
79 | ),
80 | query_vector=query_vector,
81 | limit=k // 3,
82 | ),
83 | ]
84 |
85 | return utils.flatten(vectors)
86 |
87 | def retrieve_top_k(self, k: int, to_expand_to_n_queries: int) -> list:
88 | generated_queries = self._query_expander.generate_response(
89 | self.query, to_expand_to_n=to_expand_to_n_queries
90 | )
91 | logger.info(
92 | "Successfully generated queries for search.",
93 | num_queries=len(generated_queries),
94 | )
95 |
96 | author_id = self._metadata_extractor.generate_response(self.query)
97 | logger.info(
98 | "Successfully extracted the author_id from the query.",
99 | author_id=author_id,
100 | )
101 |
102 | with concurrent.futures.ThreadPoolExecutor() as executor:
103 | search_tasks = [
104 | executor.submit(self._search_single_query, query, author_id, k)
105 | for query in generated_queries
106 | ]
107 |
108 | hits = [
109 | task.result() for task in concurrent.futures.as_completed(search_tasks)
110 | ]
111 | hits = utils.flatten(hits)
112 |
113 | logger.info("All documents retrieved successfully.", num_documents=len(hits))
114 |
115 | return hits
116 |
117 | def rerank(self, hits: list, keep_top_k: int) -> list[str]:
118 | content_list = [hit.payload["content"] for hit in hits]
119 | rerank_hits = self._reranker.generate_response(
120 | query=self.query, passages=content_list, keep_top_k=keep_top_k
121 | )
122 |
123 | logger.info("Documents reranked successfully.", num_documents=len(rerank_hits))
124 |
125 | return rerank_hits
126 |
127 | def set_query(self, query: str):
128 | self.query = query
129 |
--------------------------------------------------------------------------------
/5-inference/rag/self_query.py:
--------------------------------------------------------------------------------
1 | from langchain_openai import ChatOpenAI
2 |
3 | from llm_components.chain import GeneralChain
4 | from llm_components.prompt_templates import SelfQueryTemplate
5 | from config import settings
6 |
7 |
8 | class SelfQuery:
9 | @staticmethod
10 | def generate_response(query: str) -> str:
11 | prompt = SelfQueryTemplate().create_template()
12 | model = ChatOpenAI(
13 | model=settings.OPENAI_MODEL_ID,
14 | api_key=settings.OPENAI_API_KEY,
15 | temperature=0,
16 | )
17 |
18 | chain = GeneralChain().get_chain(
19 | llm=model, output_key="metadata_filter_value", template=prompt
20 | )
21 |
22 | response = chain.invoke({"question": query})
23 | result = response["metadata_filter_value"]
24 |
25 | return result
26 |
--------------------------------------------------------------------------------
/5-inference/utils/__init__.py:
--------------------------------------------------------------------------------
1 | def flatten(nested_list: list) -> list:
2 | """Flatten a list of lists into a single list."""
3 |
4 | return [item for sublist in nested_list for item in sublist]
5 |
--------------------------------------------------------------------------------
/5-inference/utils/chunking.py:
--------------------------------------------------------------------------------
1 | from langchain.text_splitter import (
2 | RecursiveCharacterTextSplitter,
3 | SentenceTransformersTokenTextSplitter,
4 | )
5 |
6 | from config import settings
7 |
8 |
9 | def chunk_text(text: str) -> list[str]:
10 | character_splitter = RecursiveCharacterTextSplitter(
11 | separators=["\n\n"], chunk_size=500, chunk_overlap=0
12 | )
13 | text_split = character_splitter.split_text(text)
14 |
15 | token_splitter = SentenceTransformersTokenTextSplitter(
16 | chunk_overlap=50,
17 | tokens_per_chunk=settings.EMBEDDING_MODEL_MAX_INPUT_LENGTH,
18 | model_name=settings.EMBEDDING_MODEL_ID,
19 | )
20 | chunks = []
21 |
22 | for section in text_split:
23 | chunks.extend(token_splitter.split_text(section))
24 |
25 | return chunks
26 |
--------------------------------------------------------------------------------
/5-inference/utils/cleaning.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from unstructured.cleaners.core import (
4 | clean,
5 | clean_non_ascii_chars,
6 | replace_unicode_quotes,
7 | )
8 |
9 |
10 | def unbold_text(text):
11 | # Mapping of bold numbers to their regular equivalents
12 | bold_numbers = {
13 | "𝟬": "0",
14 | "𝟭": "1",
15 | "𝟮": "2",
16 | "𝟯": "3",
17 | "𝟰": "4",
18 | "𝟱": "5",
19 | "𝟲": "6",
20 | "𝟳": "7",
21 | "𝟴": "8",
22 | "𝟵": "9",
23 | }
24 |
25 | # Function to convert bold characters (letters and numbers)
26 | def convert_bold_char(match):
27 | char = match.group(0)
28 | # Convert bold numbers
29 | if char in bold_numbers:
30 | return bold_numbers[char]
31 | # Convert bold uppercase letters
32 | elif "\U0001d5d4" <= char <= "\U0001d5ed":
33 | return chr(ord(char) - 0x1D5D4 + ord("A"))
34 | # Convert bold lowercase letters
35 | elif "\U0001d5ee" <= char <= "\U0001d607":
36 | return chr(ord(char) - 0x1D5EE + ord("a"))
37 | else:
38 | return char # Return the character unchanged if it's not a bold number or letter
39 |
40 | # Regex for bold characters (numbers, uppercase, and lowercase letters)
41 | bold_pattern = re.compile(
42 | r"[\U0001D5D4-\U0001D5ED\U0001D5EE-\U0001D607\U0001D7CE-\U0001D7FF]"
43 | )
44 | text = bold_pattern.sub(convert_bold_char, text)
45 |
46 | return text
47 |
48 |
49 | def unitalic_text(text):
50 | # Function to convert italic characters (both letters)
51 | def convert_italic_char(match):
52 | char = match.group(0)
53 | # Unicode ranges for italic characters
54 | if "\U0001d608" <= char <= "\U0001d621": # Italic uppercase A-Z
55 | return chr(ord(char) - 0x1D608 + ord("A"))
56 | elif "\U0001d622" <= char <= "\U0001d63b": # Italic lowercase a-z
57 | return chr(ord(char) - 0x1D622 + ord("a"))
58 | else:
59 | return char # Return the character unchanged if it's not an italic letter
60 |
61 | # Regex for italic characters (uppercase and lowercase letters)
62 | italic_pattern = re.compile(r"[\U0001D608-\U0001D621\U0001D622-\U0001D63B]")
63 | text = italic_pattern.sub(convert_italic_char, text)
64 |
65 | return text
66 |
67 |
68 | def remove_emojis_and_symbols(text):
69 | # Extended pattern to include specific symbols like ↓ (U+2193) or ↳ (U+21B3)
70 | emoji_and_symbol_pattern = re.compile(
71 | "["
72 | "\U0001f600-\U0001f64f" # emoticons
73 | "\U0001f300-\U0001f5ff" # symbols & pictographs
74 | "\U0001f680-\U0001f6ff" # transport & map symbols
75 | "\U0001f1e0-\U0001f1ff" # flags (iOS)
76 | "\U00002193" # downwards arrow
77 | "\U000021b3" # downwards arrow with tip rightwards
78 | "\U00002192" # rightwards arrow
79 | "]+",
80 | flags=re.UNICODE,
81 | )
82 |
83 | return emoji_and_symbol_pattern.sub(r" ", text)
84 |
85 |
86 | def replace_urls_with_placeholder(text, placeholder="[URL]"):
87 | # Regular expression pattern for matching URLs
88 | url_pattern = r"https?://\S+|www\.\S+"
89 |
90 | return re.sub(url_pattern, placeholder, text)
91 |
92 |
93 | def remove_non_ascii(text: str) -> str:
94 | text = text.encode("ascii", "ignore").decode("ascii")
95 | return text
96 |
97 |
98 | def clean_text(text_content: str) -> str:
99 | cleaned_text = unbold_text(text_content)
100 | cleaned_text = unitalic_text(cleaned_text)
101 | cleaned_text = remove_emojis_and_symbols(cleaned_text)
102 | cleaned_text = clean(cleaned_text)
103 | cleaned_text = replace_unicode_quotes(cleaned_text)
104 | cleaned_text = clean_non_ascii_chars(cleaned_text)
105 | cleaned_text = replace_urls_with_placeholder(cleaned_text)
106 |
107 | return cleaned_text
108 |
--------------------------------------------------------------------------------
/5-inference/utils/embeddings.py:
--------------------------------------------------------------------------------
1 | from InstructorEmbedding import INSTRUCTOR
2 | from sentence_transformers.SentenceTransformer import SentenceTransformer
3 |
4 | from config import settings
5 |
6 |
7 | def embedd_text(text: str):
8 | model = SentenceTransformer(settings.EMBEDDING_MODEL_ID)
9 | return model.encode(text)
10 |
11 |
12 | def embedd_repositories(text: str):
13 | model = INSTRUCTOR("hkunlp/instructor-xl")
14 | sentence = text
15 | instruction = "Represent the structure of the repository"
16 | return model.encode([instruction, sentence])
17 |
--------------------------------------------------------------------------------
/GENERATE_INSTRUCT_DATASET.md:
--------------------------------------------------------------------------------
1 | # Generate Data for the LLM Fine-Tuning Task
2 |
3 | ## Component Structure
4 |
5 | ### File Handling
6 | - `file_handler.py`: Manages file I/O operations, enabling reading and writing of JSON formatted data.
7 |
8 | ### LLM Communication
9 | - `llm_communication.py`: Handles communication with OpenAI's LLMs, sending prompts and processing responses.
10 |
11 | ### Data Generation
12 | - `generate_data.py`: Orchestrates the generation of training data by integrating file handling, LLM communication, and data formatting (see the sketch below).
13 |
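None of these three modules are reproduced in this document, so the snippet below is only a minimal, hypothetical sketch of how they might fit together. The helper names (`read_json`, `write_json`, `generate_instruct_pairs`), the prompt, and the file names are illustrative placeholders, not the project's actual API; only the `ChatOpenAI` client and the `settings` fields mirror what the rest of the repository uses.

```python
# Hypothetical sketch only: helper names, prompt, and file names are placeholders.
import json

from langchain_openai import ChatOpenAI

from config import settings


def read_json(path: str) -> list[dict]:
    with open(path, "r") as f:
        return json.load(f)


def write_json(path: str, data: list[dict]) -> None:
    with open(path, "w") as f:
        json.dump(data, f, indent=2)


def generate_instruct_pairs(documents: list[dict]) -> list[dict]:
    llm = ChatOpenAI(model=settings.OPENAI_MODEL_ID, api_key=settings.OPENAI_API_KEY)

    pairs = []
    for doc in documents:
        prompt = (
            "Write an instruction that the following content would answer, "
            f"then return only that instruction:\n\n{doc['content']}"
        )
        response = llm.invoke(prompt)
        pairs.append({"instruction": response.content, "content": doc["content"]})

    return pairs


if __name__ == "__main__":
    cleaned_documents = read_json("cleaned_posts.json")  # placeholder input file
    dataset = generate_instruct_pairs(cleaned_documents)
    write_json("posts-instruct-dataset.json", dataset)  # placeholder output file
```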
14 |
15 | ### Usage
16 |
17 | The project includes a `Makefile` for easy management of common tasks. Here are the main commands you can use:
18 |
19 | - `make help`: Displays help for each make command.
20 | - `make local-start`: Build and start MongoDB, RabbitMQ, and Qdrant.
21 | - `make local-test-github`: Insert test data into MongoDB.
22 | - `make generate-dataset`: Generate the fine-tuning dataset and version it in Comet ML.
--------------------------------------------------------------------------------
/INSTALL_AND_USAGE.md:
--------------------------------------------------------------------------------
1 | # Local Install
2 |
3 | ## System dependencies
4 |
5 | Before starting to install the LLM Twin project, make sure you have installed the following dependencies on your system:
6 |
7 | - [Docker ">=v27.0.3"](https://www.docker.com/)
8 | - [GNU Make ">=3.81"](https://www.gnu.org/software/make/)
9 |
10 | The whole LLM Twin application will be run locally using Docker.
11 |
12 | ## Configure
13 |
14 | All the sensitive credentials are placed in a `.env` file that will always sit on your hardware.
15 |
16 | Go to the root of the repository, copy our `.env.example` file and fill it with your credentials:
17 | ```shell
18 | cp .env.example .env
19 | ```
20 |
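As a rough guide, the variables referenced throughout this repository (in `config.py`, `data-ingestion/mq.py`, `data-ingestion/cdc.py`, and the RAG modules) include the ones below. The values are placeholders, the variable list in `.env.example` is the authoritative one, and you will also need your Comet ML and Qwak credentials under the names listed there:

```shell
# Illustrative values only; copy the real variable names from .env.example.
MONGO_DATABASE_HOST="mongodb://mongo1:30001,mongo2:30002,mongo3:30003/?replicaSet=my-replica-set"
MONGO_DATABASE_NAME="scrabble"

RABBITMQ_HOST="localhost"
RABBITMQ_PORT=5672
RABBITMQ_DEFAULT_USERNAME="guest"
RABBITMQ_DEFAULT_PASSWORD="guest"
RABBITMQ_QUEUE_NAME="default"

OPENAI_MODEL_ID="gpt-3.5-turbo"   # placeholder model id
OPENAI_API_KEY="sk-..."

# Optional: only needed by the LinkedIn crawler
LINKEDIN_USERNAME="your-email"
LINKEDIN_PASSWORD="your-password"
```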
21 | ## Supported commands
22 |
23 | We will use `GNU Make` to install and run our application.
24 |
25 | To see all our supported commands, run the following:
26 | ```shell
27 | make help
28 | ```
29 |
30 | ## Set up the infrastructure
31 |
32 | ### Spin up the infrastructure
33 |
34 | Now, the whole infrastructure can be spun up using a simple Make command:
35 |
36 | ```shell
37 | make local-start
38 | ```
39 |
40 | Behind the scenes it will build and run all the Docker images defined in the [docker-compose.yml](https://github.com/decodingml/llm-twin-course/blob/main/docker-compose.yml) file.
41 |
42 | ## Read this before starting 🚨
43 |
44 | > [!CAUTION]
45 | > For `Mongo` to work with multiple replicas (as we use it in our Docker setup) on `macOS` or `Linux` systems, you have to add the following lines of code to `/etc/hosts`:
46 | >
47 | > ```
48 | > 127.0.0.1 mongo1
49 | > 127.0.0.1 mongo2
50 | > 127.0.0.1 mongo3
51 | > ```
52 | >
53 | > From what we know, on `Windows`, it works out of the box. For more details, check out this article: https://medium.com/workleap/the-only-local-mongodb-replica-set-with-docker-compose-guide-youll-ever-need-2f0b74dd8384
54 |
55 | > [!WARNING]
56 | > For `arm` users (e.g., `M1/M2/M3 macOS devices`), go to your Docker desktop application and enable `Use Rosetta for x86_64/amd64 emulation on Apple Silicon` from the Settings. There is a checkbox you have to check.
57 | > Otherwise, your Docker containers will crash.
58 |
59 | ### Tear down the infrastructure
60 |
61 | Run the following `Make` command to tear down all your docker containers:
62 |
63 | ```shell
64 | make local-stop
65 | ```
66 |
67 | ## Run an end-to-end flow
68 |
69 | Now that we have configured our credentials and started our infrastructure, let's look at how to run an end-to-end flow of the LLM Twin application.
70 |
71 | > [!IMPORTANT]
72 | > Note that we won't go into the details of the system here. To fully understand it, check out our free article series, which explains everything step-by-step: [LLM Twin articles series](https://medium.com/decodingml/llm-twin-course/home).
73 |
74 | ### Step 1: Crawlers
75 |
76 | Trigger the crawler to collect data and add it to MongoDB:
77 |
78 | ```shell
79 | make local-test-github
80 | # or make local-test-medium
81 | ```
82 |
83 | After the data is added to MongoDB, the CDC component is triggered and publishes a corresponding event to RabbitMQ.
84 |
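Concretely, `data-ingestion/cdc.py` (shown later in this document) takes the inserted Mongo document, drops its `_id`, and adds two fields before publishing: `type` (the source collection) and `entry_id` (the stringified document id). A message for a crawled LinkedIn post might look roughly like this (field values are illustrative):

```json
{
  "platform": "linkedin",
  "content": { "Post_0": { "text": "..." } },
  "author_id": "Paul Iuztin",
  "type": "posts",
  "entry_id": "0f8f2a4e-6c1b-4d2a-9c7e-3b5a1d2e4f6a"
}
```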
85 | ### Step 2: Feature engineering & Vector DB
86 |
87 | Check that the feature pipeline works and the vector DB is successfully populated.
88 |
89 | To verify the `feature pipeline`, check the logs of the `llm-twin-bytewax` Docker container by running:
90 | ```shell
91 | docker logs llm-twin-bytewax
92 | ```
93 | You should see logs reflecting the cleaning, chunking, and embedding operations (without any errors, of course).
94 |
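Those operations map onto the helpers reproduced earlier in this document under `/5-inference/utils/` (the feature pipeline ships its own copies of them). A minimal sketch of what happens to each document payload, with an invented example input:

```python
# Minimal sketch of the per-document processing, using the helpers from
# utils/cleaning.py, utils/chunking.py, and utils/embeddings.py shown earlier.
from utils.cleaning import clean_text
from utils.chunking import chunk_text
from utils.embeddings import embedd_text

raw_post = "𝗥𝗔𝗚 𝘀𝘆𝘀𝘁𝗲𝗺𝘀 explained ↓ https://example.com"  # invented example input

cleaned = clean_text(raw_post)          # unbold/unitalic, strip emojis, replace URLs, drop non-ASCII
chunks = chunk_text(cleaned)            # paragraph split, then token-aware re-chunking
embeddings = [embedd_text(chunk) for chunk in chunks]  # sentence-transformers vectors
```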
95 | To check that the Qdrant `vector DB` is populated successfully, go to its dashboard at [localhost:6333/dashboard](http://localhost:6333/dashboard). There, you should see the repositories or articles collections created and populated.
96 |
97 | > [!NOTE]
98 | > If using the cloud version of Qdrant, go to your Qdrant account and cluster to see the same thing as in the local dashboard.
99 |
100 | ### Step 3: RAG retrieval step
101 |
102 | Now that we have some data in our vector DB, let's test out the RAG retriever:
103 | ```shell
104 | make local-test-retriever
105 | ```
106 |
107 | > [!IMPORTANT]
108 | > Before running this command, check [Qdrant's dashboard](http://localhost:6333/dashboard) to ensure that your vector DB is populated with data.
109 |
110 | > [!NOTE]
111 | > For more details on the RAG component, please refer to the [RAG](https://github.com/decodingml/llm-twin-course/blob/main/RAG.md) document.
112 |
113 |
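Under the hood, this command exercises the `VectorRetriever` from `/5-inference/rag/retriever.py` (shown earlier in this document). A minimal sketch of the same flow, with an invented query:

```python
# Minimal sketch of the retrieval flow from 5-inference/rag/retriever.py;
# the query string below is an invented example.
from rag.retriever import VectorRetriever

query = "Could you draft a LinkedIn post discussing RAG systems?"

retriever = VectorRetriever(query=query)

# Expands the query, extracts the author_id, and searches the posts, articles,
# and repositories collections in parallel (k must be > 3; each search uses k // 3).
hits = retriever.retrieve_top_k(k=6, to_expand_to_n_queries=3)

# Reranks the retrieved passages with an LLM and keeps the best keep_top_k.
passages = retriever.rerank(hits=hits, keep_top_k=3)

for rank, passage in enumerate(passages, start=1):
    print(f"{rank}. {passage[:100]}")
```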
114 | ### Step 4: Generate the instruct dataset
115 |
116 | The last step before fine-tuning is to generate an instruct dataset and track it as an artifact in Comet ML. To do so, run:
117 | ```shell
118 | make generate-dataset
119 | ```
120 |
121 | > [!IMPORTANT]
122 | > Now open [Comet ML](https://www.comet.com/signup/?utm_source=decoding_ml&utm_medium=partner&utm_content=github), go to your workspace, and open the `Artifacts` tab. There, you should find three artifacts as follows:
123 | > - `articles-instruct-dataset`
124 | > - `posts-instruct-dataset`
125 | > - `repositories-instruct-dataset`
126 |
127 | > [!NOTE]
128 | > For more details on generating the instruct dataset component, please refer to the [GENERATE_INSTRUCT_DATASET](https://github.com/decodingml/llm-twin-course/blob/main/GENERATE_INSTRUCT_DATASET.md) document.
129 |
130 |
131 | ### Step 5: Fine-tuning
132 |
133 | For details on setting up the training pipeline on [Qwak](https://www.qwak.com/lp/end-to-end-mlops/?utm_source=github&utm_medium=referral&utm_campaign=decodingml) and running it, please refer to the [TRAINING](https://github.com/decodingml/llm-twin-course/blob/main/TRAINING.md) document.
134 |
135 | ### Step 6: Inference
136 |
137 | After you have fine-tuned your model, the next step is to deploy the inference pipeline to Qwak as a REST API service:
138 | ```shell
139 | make deploy-inference-pipeline
140 | ```
141 |
142 | > [!NOTE]
143 | > You can check out the progress of the deployment on [Qwak](https://www.qwak.com/lp/end-to-end-mlops/?utm_source=github&utm_medium=referral&utm_campaign=decodingml).
144 |
145 | After the deployment is finished (it will take a while), you can call the service by running:
146 | ```shell
147 | make call-inference-pipeline
148 | ```
149 |
150 | Finally, when you are done using it, make sure to delete the deployment by running:
151 | ```shell
152 | make undeploy-inference-pipeline
153 | ```
154 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Decoding ML
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | include .env
2 |
3 | $(eval export $(shell sed -ne 's/ *#.*$$//; /./ s/=.*$$// p' .env))
4 |
5 | AWS_CURRENT_REGION_ID := $(shell aws configure get region)
6 | AWS_CURRENT_ACCOUNT_ID := $(shell aws sts get-caller-identity --query "Account" --output text)
7 |
8 | PYTHONPATH := $(shell pwd)
9 |
10 | .PHONY: build-all env-var
11 |
12 | RED := \033[0;31m
13 | BLUE := \033[0;34m
14 | GREEN := \033[0;32m
15 | YELLOW := \033[0;33m
16 | RESET := \033[0m
17 |
18 | env-var:
19 | @echo "Environment variable VAR is: ${RABBITMQ_HOST}"
20 |
21 | help:
22 | @grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m:$$(echo $$l | cut -f 2- -d'#')\n"; done
23 |
24 |
25 | # ------ Infrastructure ------
26 |
27 | push: # Build & push image to AWS ECR (e.g. make push IMAGE_TAG=latest)
28 | echo "Logging into AWS ECR..."
29 | aws ecr get-login-password --region $(AWS_CURRENT_REGION_ID) | docker login --username AWS --password-stdin $(AWS_CURRENT_ACCOUNT_ID).dkr.ecr.$(AWS_CURRENT_REGION_ID).amazonaws.com
30 | echo "Build & Push Docker image..."
31 | docker buildx build --platform linux/amd64 -t $(AWS_CURRENT_ACCOUNT_ID).dkr.ecr.$(AWS_CURRENT_REGION_ID).amazonaws.com/crawler:$(IMAGE_TAG) .
32 | echo "Push completed successfully."
33 |
34 | local-start: # Build and start the local infrastructure.
35 | docker compose -f docker-compose.yml up --build -d
36 |
37 | local-stop: # Stop local infrastructure.
38 | docker compose -f docker-compose.yml down --remove-orphans
39 |
40 |
41 | # ------ Crawler ------
42 |
43 | local-test-medium: # Send test command on local to test the lambda with a Medium article
44 | curl -X POST "http://localhost:9010/2015-03-31/functions/function/invocations" \
45 | -d '{"user": "Paul Iuztin", "link": "https://medium.com/decodingml/an-end-to-end-framework-for-production-ready-llm-systems-by-building-your-llm-twin-2cc6bb01141f"}'
46 |
47 | local-test-github: # Send test command on local to test the lambda with a Github repository
48 | curl -X POST "http://localhost:9010/2015-03-31/functions/function/invocations" \
49 | -d '{"user": "Paul Iuztin", "link": "https://github.com/decodingml/llm-twin-course"}'
50 |
51 | cloud-test-github: # Send command to the cloud lambda with a Github repository
52 | aws lambda invoke \
53 | --function-name crawler \
54 | --cli-binary-format raw-in-base64-out \
55 | --payload '{"user": "Paul Iuztin", "link": "https://github.com/decodingml/llm-twin-course"}' \
56 | response.json
57 |
58 | # ------ RAG Feature Pipeline ------
59 |
60 | local-feature-pipeline: # Run the RAG feature pipeline
61 | RUST_BACKTRACE=full poetry run python -m bytewax.run 3-feature-pipeline/main.py
62 |
63 | generate-dataset: # Generate dataset for finetuning and version it in Comet ML
64 | docker exec -it llm-twin-bytewax python -m finetuning.generate_data
65 |
66 | # ------ RAG ------
67 |
68 | local-test-retriever: # Test retriever
69 | docker exec -it llm-twin-bytewax python -m retriever
70 |
71 | # ------ Qwak: Training pipeline ------
72 |
73 | create-qwak-project: # Create Qwak project for serving the model
74 | @echo "$(YELLOW)Creating Qwak project $(RESET)"
75 | qwak models create "llm_twin" --project "llm-twin-course"
76 |
77 | local-test-training-pipeline: # Test Qwak model locally
78 | poetry run python test_local.py
79 |
80 | deploy-training-pipeline: # Deploy the model to Qwak
81 | @echo "$(YELLOW)Dumping poetry env requirements to $(RESET) $(GREEN) requirements.txt $(RESET)"
82 | poetry export -f requirements.txt --output finetuning/requirements.txt --without-hashes
83 | @echo "$(GREEN)Triggering Qwak Model Build$(RESET)"
84 | poetry run qwak models build -f build_config.yaml .
85 |
86 |
87 | # ------ Qwak: Inference pipeline ------
88 |
89 | deploy-inference-pipeline: # Deploy the inference pipeline to Qwak.
90 | poetry run qwak models deploy realtime --model-id "llm_twin" --instance "gpu.a10.2xl" --timeout 50000 --replicas 2 --server-workers 2
91 |
92 | undeploy-inference-pipeline: # Remove the inference pipeline deployment from Qwak.
93 | poetry run qwak models undeploy --model-id "llm_twin"
94 |
95 | call-inference-pipeline: # Call the inference pipeline.
96 | poetry run python main.py
97 |
98 | # ------ Superlinked Bonus Series ------
99 |
100 | local-start-superlinked: # Build and start the local infrastructure used in the Superlinked series.
101 | docker compose -f docker-compose-superlinked.yml up --build -d
102 |
103 | local-stop-superlinked: # Stop local infrastructure used in the Superlinked series.
104 | docker compose -f docker-compose-superlinked.yml down --remove-orphans
105 |
106 | test-superlinked-server:
107 | poetry run python 6-bonus-superlinked-rag/local_test.py
108 |
109 | local-bytewax-superlinked: # Run bytewax pipeline powered by superlinked
110 | RUST_BACKTRACE=full poetry run python -m bytewax.run 6-bonus-superlinked-rag/main.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # LLM Twin: Building Production-Ready AI Replica
3 | Build a production-ready LLM & RAG system by building your LLM Twin.
4 | From data gathering to productionizing LLMs using LLMOps good practices.
5 |
6 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | from pydantic_settings import BaseSettings, SettingsConfigDict
2 |
3 |
4 | class Settings(BaseSettings):
5 | model_config = SettingsConfigDict(env_file="../.env", env_file_encoding="utf-8")
6 |
7 | MONGO_DATABASE_HOST: str = (
8 | "mongodb://mongo1:30001,mongo2:30002,mongo3:30003/?replicaSet=my-replica-set"
9 | )
10 | MONGO_DATABASE_NAME: str = "scrabble"
11 |
12 | # Optional LinkedIn credentials for scraping your profile
13 | LINKEDIN_USERNAME: str | None = None
14 | LINKEDIN_PASSWORD: str | None = None
15 |
16 |
17 | settings = Settings()
18 |
--------------------------------------------------------------------------------
/crawlers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/J-coder118/LLM-Twin/707f9d8bb1cf402e04644bff9c5c521ce0938087/crawlers/__init__.py
--------------------------------------------------------------------------------
/crawlers/base.py:
--------------------------------------------------------------------------------
1 | import time
2 | from abc import ABC, abstractmethod
3 | from tempfile import mkdtemp
4 |
5 | from db.documents import BaseDocument
6 | from selenium import webdriver
7 | from selenium.webdriver.chrome.options import Options
8 |
9 |
10 | class BaseCrawler(ABC):
11 | model: type[BaseDocument]
12 |
13 | @abstractmethod
14 | def extract(self, link: str, **kwargs) -> None: ...
15 |
16 |
17 | class BaseAbstractCrawler(BaseCrawler, ABC):
18 | def __init__(self, scroll_limit: int = 5) -> None:
19 | options = webdriver.ChromeOptions()
20 | options.binary_location = "/opt/chrome/chrome"
21 | options.add_argument("--no-sandbox")
22 | options.add_argument("--headless=new")
23 | options.add_argument("--single-process")
24 | options.add_argument("--disable-dev-shm-usage")
25 | options.add_argument("--disable-gpu")
26 | options.add_argument("--log-level=3")
27 | options.add_argument("--disable-popup-blocking")
28 | options.add_argument("--disable-notifications")
29 | options.add_argument("--disable-dev-tools")
30 | options.add_argument("--ignore-certificate-errors")
31 | options.add_argument("--no-zygote")
32 | options.add_argument(f"--user-data-dir={mkdtemp()}")
33 | options.add_argument(f"--data-path={mkdtemp()}")
34 | options.add_argument(f"--disk-cache-dir={mkdtemp()}")
35 | options.add_argument("--remote-debugging-port=9222")
36 |
37 | self.set_extra_driver_options(options)
38 |
39 | self.scroll_limit = scroll_limit
40 | self.driver = webdriver.Chrome(
41 | service=webdriver.ChromeService("/opt/chromedriver"),
42 | options=options,
43 | )
44 |
45 | def set_extra_driver_options(self, options: Options) -> None:
46 | pass
47 |
48 | def login(self) -> None:
49 | pass
50 |
51 | def scroll_page(self) -> None:
52 |         """Scroll through the page based on the scroll limit."""
53 | current_scroll = 0
54 | last_height = self.driver.execute_script("return document.body.scrollHeight")
55 | while True:
56 | self.driver.execute_script(
57 | "window.scrollTo(0, document.body.scrollHeight);"
58 | )
59 | time.sleep(5)
60 | new_height = self.driver.execute_script("return document.body.scrollHeight")
61 | if new_height == last_height or (
62 | self.scroll_limit and current_scroll >= self.scroll_limit
63 | ):
64 | break
65 | last_height = new_height
66 | current_scroll += 1
67 |
--------------------------------------------------------------------------------
/crawlers/github.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import subprocess
4 | import tempfile
5 |
6 | from aws_lambda_powertools import Logger
7 |
8 | from crawlers.base import BaseCrawler
9 | from db.documents import RepositoryDocument
10 |
11 | logger = Logger(service="llm-twin-course/crawler")
12 |
13 |
14 | class GithubCrawler(BaseCrawler):
15 | model = RepositoryDocument
16 |
17 | def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
18 | super().__init__()
19 | self._ignore = ignore
20 |
21 | def extract(self, link: str, **kwargs) -> None:
22 |         logger.info(f"Starting to scrape GitHub repository: {link}")
23 |
24 | repo_name = link.rstrip("/").split("/")[-1]
25 |
26 | local_temp = tempfile.mkdtemp()
27 |
28 | try:
29 | os.chdir(local_temp)
30 | subprocess.run(["git", "clone", link])
31 |
32 | repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])
33 |
34 | tree = {}
35 | for root, dirs, files in os.walk(repo_path):
36 | dir = root.replace(repo_path, "").lstrip("/")
37 | if dir.startswith(self._ignore):
38 | continue
39 |
40 | for file in files:
41 | if file.endswith(self._ignore):
42 | continue
43 | file_path = os.path.join(dir, file)
44 | with open(os.path.join(root, file), "r", errors="ignore") as f:
45 | tree[file_path] = f.read().replace(" ", "")
46 |
47 | instance = self.model(
48 | name=repo_name, link=link, content=tree, owner_id=kwargs.get("user")
49 | )
50 | instance.save()
51 |
52 | except Exception:
53 | raise
54 | finally:
55 | shutil.rmtree(local_temp)
56 |
57 |         logger.info(f"Finished scraping GitHub repository: {link}")
58 |
--------------------------------------------------------------------------------
/crawlers/linkedin.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Dict, List
3 |
4 | from aws_lambda_powertools import Logger
5 | from bs4 import BeautifulSoup
6 | from bs4.element import Tag
7 | from errors import ImproperlyConfigured
8 | from selenium.webdriver.common.by import By
9 |
10 | from db.documents import PostDocument
11 | from crawlers.base import BaseAbstractCrawler
12 | from config import settings
13 |
14 | logger = Logger(service="decodingml/crawler")
15 |
16 |
17 | class LinkedInCrawler(BaseAbstractCrawler):
18 | model = PostDocument
19 |
20 | def set_extra_driver_options(self, options) -> None:
21 | options.add_experimental_option("detach", True)
22 |
23 | def extract(self, link: str, **kwargs):
24 |         logger.info(f"Starting to scrape data for profile: {link}")
25 |
26 | self.login()
27 |
28 | soup = self._get_page_content(link)
29 |
30 | data = {
31 | "Name": self._scrape_section(soup, "h1", class_="text-heading-xlarge"),
32 | "About": self._scrape_section(soup, "div", class_="display-flex ph5 pv3"),
33 | "Main Page": self._scrape_section(soup, "div", {"id": "main-content"}),
34 | "Experience": self._scrape_experience(link),
35 | "Education": self._scrape_education(link),
36 | }
37 |
38 | self.driver.get(link)
39 | time.sleep(5)
40 | button = self.driver.find_element(
41 | By.CSS_SELECTOR,
42 | ".app-aware-link.profile-creator-shared-content-view__footer-action",
43 | )
44 | button.click()
45 |
46 | # Scrolling and scraping posts
47 | self.scroll_page()
48 | soup = BeautifulSoup(self.driver.page_source, "html.parser")
49 | post_elements = soup.find_all(
50 | "div",
51 | class_="update-components-text relative update-components-update-v2__commentary",
52 | )
53 | buttons = soup.find_all("button", class_="update-components-image__image-link")
54 | post_images = self._extract_image_urls(buttons)
55 |
56 | posts = self._extract_posts(post_elements, post_images)
57 | logger.info(f"Found {len(posts)} posts for profile: {link}")
58 |
59 | self.driver.close()
60 |
61 | self.model.bulk_insert(
62 | [
63 | PostDocument(
64 | platform="linkedin", content=post, author_id=kwargs.get("user")
65 | )
66 | for post in posts
67 | ]
68 | )
69 |
70 |         logger.info(f"Finished scraping data for profile: {link}")
71 |
72 | def _scrape_section(self, soup: BeautifulSoup, *args, **kwargs) -> str:
73 | """Scrape a specific section of the LinkedIn profile."""
74 | # Example: Scrape the 'About' section
75 | parent_div = soup.find(*args, **kwargs)
76 | return parent_div.get_text(strip=True) if parent_div else ""
77 |
78 | def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]:
79 | """
80 | Extracts image URLs from button elements.
81 |
82 | Args:
83 | buttons (List[Tag]): A list of BeautifulSoup Tag objects representing buttons.
84 |
85 | Returns:
86 | Dict[str, str]: A dictionary mapping post indexes to image URLs.
87 | """
88 | post_images = {}
89 | for i, button in enumerate(buttons):
90 | img_tag = button.find("img")
91 | if img_tag and "src" in img_tag.attrs:
92 | post_images[f"Post_{i}"] = img_tag["src"]
93 | else:
94 | logger.warning("No image found in this button")
95 | return post_images
96 |
97 | def _get_page_content(self, url: str) -> BeautifulSoup:
98 | """Retrieve the page content of a given URL."""
99 | self.driver.get(url)
100 | time.sleep(5)
101 | return BeautifulSoup(self.driver.page_source, "html.parser")
102 |
103 | def _extract_posts(
104 | self, post_elements: List[Tag], post_images: Dict[str, str]
105 | ) -> Dict[str, Dict[str, str]]:
106 | """
107 | Extracts post texts and combines them with their respective images.
108 |
109 | Args:
110 | post_elements (List[Tag]): A list of BeautifulSoup Tag objects representing post elements.
111 | post_images (Dict[str, str]): A dictionary containing image URLs mapped by post index.
112 |
113 | Returns:
114 | Dict[str, Dict[str, str]]: A dictionary containing post data with text and optional image URL.
115 | """
116 | posts_data = {}
117 | for i, post_element in enumerate(post_elements):
118 | post_text = post_element.get_text(strip=True, separator="\n")
119 | post_data = {"text": post_text}
120 | if f"Post_{i}" in post_images:
121 | post_data["image"] = post_images[f"Post_{i}"]
122 | posts_data[f"Post_{i}"] = post_data
123 | return posts_data
124 |
125 | def _scrape_experience(self, profile_url: str) -> str:
126 | """Scrapes the Experience section of the LinkedIn profile."""
127 | self.driver.get(profile_url + "/details/experience/")
128 | time.sleep(5)
129 | soup = BeautifulSoup(self.driver.page_source, "html.parser")
130 | experience_content = soup.find("section", {"id": "experience-section"})
131 | return experience_content.get_text(strip=True) if experience_content else ""
132 |
133 | def _scrape_education(self, profile_url: str) -> str:
134 | self.driver.get(profile_url + "/details/education/")
135 | time.sleep(5)
136 | soup = BeautifulSoup(self.driver.page_source, "html.parser")
137 | education_content = soup.find("section", {"id": "education-section"})
138 | return education_content.get_text(strip=True) if education_content else ""
139 |
140 | def login(self):
141 | """Log in to LinkedIn."""
142 | self.driver.get("https://www.linkedin.com/login")
143 | if not settings.LINKEDIN_USERNAME and not settings.LINKEDIN_PASSWORD:
144 | raise ImproperlyConfigured(
145 |                 "LinkedIn scraper requires a valid account to perform extraction"
146 | )
147 |
148 | self.driver.find_element(By.ID, "username").send_keys(
149 | settings.LINKEDIN_USERNAME
150 | )
151 | self.driver.find_element(By.ID, "password").send_keys(
152 | settings.LINKEDIN_PASSWORD
153 | )
154 | self.driver.find_element(
155 | By.CSS_SELECTOR, ".login__form_action_container button"
156 | ).click()
157 |
--------------------------------------------------------------------------------
/crawlers/medium.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Dict, List
3 |
4 | from aws_lambda_powertools import Logger
5 | from bs4 import BeautifulSoup
6 | from bs4.element import Tag
7 | from errors import ImproperlyConfigured
8 | from selenium.webdriver.common.by import By
9 |
10 | from db.documents import PostDocument
11 | from crawlers.base import BaseAbstractCrawler
12 | from config import settings
13 |
14 | logger = Logger(service="decodingml/crawler")
15 |
16 |
17 | class LinkedInCrawler(BaseAbstractCrawler):
18 | model = PostDocument
19 |
20 | def set_extra_driver_options(self, options) -> None:
21 | options.add_experimental_option("detach", True)
22 |
23 | def extract(self, link: str, **kwargs):
24 |         logger.info(f"Starting to scrape data for profile: {link}")
25 |
26 | self.login()
27 |
28 | soup = self._get_page_content(link)
29 |
30 | data = {
31 | "Name": self._scrape_section(soup, "h1", class_="text-heading-xlarge"),
32 | "About": self._scrape_section(soup, "div", class_="display-flex ph5 pv3"),
33 | "Main Page": self._scrape_section(soup, "div", {"id": "main-content"}),
34 | "Experience": self._scrape_experience(link),
35 | "Education": self._scrape_education(link),
36 | }
37 |
38 | self.driver.get(link)
39 | time.sleep(5)
40 | button = self.driver.find_element(
41 | By.CSS_SELECTOR,
42 | ".app-aware-link.profile-creator-shared-content-view__footer-action",
43 | )
44 | button.click()
45 |
46 | # Scrolling and scraping posts
47 | self.scroll_page()
48 | soup = BeautifulSoup(self.driver.page_source, "html.parser")
49 | post_elements = soup.find_all(
50 | "div",
51 | class_="update-components-text relative update-components-update-v2__commentary",
52 | )
53 | buttons = soup.find_all("button", class_="update-components-image__image-link")
54 | post_images = self._extract_image_urls(buttons)
55 |
56 | posts = self._extract_posts(post_elements, post_images)
57 | logger.info(f"Found {len(posts)} posts for profile: {link}")
58 |
59 | self.driver.close()
60 |
61 | self.model.bulk_insert(
62 | [
63 | PostDocument(
64 | platform="linkedin", content=post, author_id=kwargs.get("user")
65 | )
66 | for post in posts
67 | ]
68 | )
69 |
70 |         logger.info(f"Finished scraping data for profile: {link}")
71 |
72 | def _scrape_section(self, soup: BeautifulSoup, *args, **kwargs) -> str:
73 | """Scrape a specific section of the LinkedIn profile."""
74 | # Example: Scrape the 'About' section
75 | parent_div = soup.find(*args, **kwargs)
76 | return parent_div.get_text(strip=True) if parent_div else ""
77 |
78 | def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]:
79 | """
80 | Extracts image URLs from button elements.
81 |
82 | Args:
83 | buttons (List[Tag]): A list of BeautifulSoup Tag objects representing buttons.
84 |
85 | Returns:
86 | Dict[str, str]: A dictionary mapping post indexes to image URLs.
87 | """
88 | post_images = {}
89 | for i, button in enumerate(buttons):
90 | img_tag = button.find("img")
91 | if img_tag and "src" in img_tag.attrs:
92 | post_images[f"Post_{i}"] = img_tag["src"]
93 | else:
94 | logger.warning("No image found in this button")
95 | return post_images
96 |
97 | def _get_page_content(self, url: str) -> BeautifulSoup:
98 | """Retrieve the page content of a given URL."""
99 | self.driver.get(url)
100 | time.sleep(5)
101 | return BeautifulSoup(self.driver.page_source, "html.parser")
102 |
103 | def _extract_posts(
104 | self, post_elements: List[Tag], post_images: Dict[str, str]
105 | ) -> Dict[str, Dict[str, str]]:
106 | """
107 | Extracts post texts and combines them with their respective images.
108 |
109 | Args:
110 | post_elements (List[Tag]): A list of BeautifulSoup Tag objects representing post elements.
111 | post_images (Dict[str, str]): A dictionary containing image URLs mapped by post index.
112 |
113 | Returns:
114 | Dict[str, Dict[str, str]]: A dictionary containing post data with text and optional image URL.
115 | """
116 | posts_data = {}
117 | for i, post_element in enumerate(post_elements):
118 | post_text = post_element.get_text(strip=True, separator="\n")
119 | post_data = {"text": post_text}
120 | if f"Post_{i}" in post_images:
121 | post_data["image"] = post_images[f"Post_{i}"]
122 | posts_data[f"Post_{i}"] = post_data
123 | return posts_data
124 |
125 | def _scrape_experience(self, profile_url: str) -> str:
126 | """Scrapes the Experience section of the LinkedIn profile."""
127 | self.driver.get(profile_url + "/details/experience/")
128 | time.sleep(5)
129 | soup = BeautifulSoup(self.driver.page_source, "html.parser")
130 | experience_content = soup.find("section", {"id": "experience-section"})
131 | return experience_content.get_text(strip=True) if experience_content else ""
132 |
133 | def _scrape_education(self, profile_url: str) -> str:
134 | self.driver.get(profile_url + "/details/education/")
135 | time.sleep(5)
136 | soup = BeautifulSoup(self.driver.page_source, "html.parser")
137 | education_content = soup.find("section", {"id": "education-section"})
138 | return education_content.get_text(strip=True) if education_content else ""
139 |
140 | def login(self):
141 | """Log in to LinkedIn."""
142 | self.driver.get("https://www.linkedin.com/login")
143 | if not settings.LINKEDIN_USERNAME and not settings.LINKEDIN_PASSWORD:
144 | raise ImproperlyConfigured(
145 |                 "LinkedIn scraper requires a valid account to perform extraction"
146 | )
147 |
148 | self.driver.find_element(By.ID, "username").send_keys(
149 | settings.LINKEDIN_USERNAME
150 | )
151 | self.driver.find_element(By.ID, "password").send_keys(
152 | settings.LINKEDIN_PASSWORD
153 | )
154 | self.driver.find_element(
155 | By.CSS_SELECTOR, ".login__form_action_container button"
156 | ).click()
157 |
--------------------------------------------------------------------------------
/data-ingestion/cdc.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 |
4 | from bson import json_util
5 | from mq import publish_to_rabbitmq
6 |
7 | from config import settings
8 | from db import MongoDatabaseConnector
9 |
10 | # Configure logging
11 | logging.basicConfig(
12 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
13 | )
14 |
15 |
16 | def stream_process():
17 | try:
18 | # Setup MongoDB connection
19 | client = MongoDatabaseConnector()
20 | db = client["scrabble"]
21 | logging.info("Connected to MongoDB.")
22 |
23 | # Watch changes in a specific collection
24 | changes = db.watch([{"$match": {"operationType": {"$in": ["insert"]}}}])
25 | for change in changes:
26 | data_type = change["ns"]["coll"]
27 | entry_id = str(change["fullDocument"]["_id"]) # Convert ObjectId to string
28 | change["fullDocument"].pop("_id")
29 | change["fullDocument"]["type"] = data_type
30 | change["fullDocument"]["entry_id"] = entry_id
31 |
32 | # Use json_util to serialize the document
33 | data = json.dumps(change["fullDocument"], default=json_util.default)
34 | logging.info(f"Change detected and serialized: {data}")
35 |
36 | # Send data to rabbitmq
37 | publish_to_rabbitmq(queue_name=settings.RABBITMQ_QUEUE_NAME, data=data)
38 | logging.info("Data published to RabbitMQ.")
39 |
40 | except Exception as e:
41 | logging.error(f"An error occurred: {e}")
42 |
43 |
44 | if __name__ == "__main__":
45 | stream_process()
46 |
--------------------------------------------------------------------------------
/data-ingestion/db.py:
--------------------------------------------------------------------------------
1 | from pymongo import MongoClient
2 | from pymongo.errors import ConnectionFailure
3 |
4 | from config import settings
5 |
6 |
7 | class MongoDatabaseConnector:
8 | """Singleton class to connect to MongoDB database."""
9 |
10 | _instance: MongoClient = None
11 |
12 | def __new__(cls, *args, **kwargs):
13 | if cls._instance is None:
14 | try:
15 | cls._instance = MongoClient(settings.MONGO_DATABASE_HOST)
16 | except ConnectionFailure as e:
17 | print(f"Couldn't connect to the database: {str(e)}")
18 | raise
19 |
20 | print(
21 | f"Connection to database with uri: {settings.MONGO_DATABASE_HOST} successful"
22 | )
23 | return cls._instance
24 |
25 | def get_database(self):
26 | return self._instance[settings.MONGO_DATABASE_NAME]
27 |
28 | def close(self):
29 | if self._instance:
30 | self._instance.close()
31 |             print("Connection to the database has been closed.")
32 |
33 |
34 | connection = MongoDatabaseConnector()
35 |
--------------------------------------------------------------------------------
/data-ingestion/mq.py:
--------------------------------------------------------------------------------
1 | import pika
2 |
3 | from config import settings
4 |
5 |
6 | class RabbitMQConnection:
7 | """Singleton class to manage RabbitMQ connection."""
8 |
9 | _instance = None
10 |
11 | def __new__(
12 | cls,
13 | host: str = None,
14 | port: int = None,
15 | username: str = None,
16 | password: str = None,
17 | virtual_host: str = "/",
18 | ):
19 | if not cls._instance:
20 | cls._instance = super().__new__(cls)
21 | return cls._instance
22 |
23 | def __init__(
24 | self,
25 | host: str = None,
26 | port: int = None,
27 | username: str = None,
28 | password: str = None,
29 | virtual_host: str = "/",
30 | fail_silently: bool = False,
31 | **kwargs,
32 | ):
33 | self.host = host or settings.RABBITMQ_HOST
34 | self.port = port or settings.RABBITMQ_PORT
35 | self.username = username or settings.RABBITMQ_DEFAULT_USERNAME
36 | self.password = password or settings.RABBITMQ_DEFAULT_PASSWORD
37 | self.virtual_host = virtual_host
38 | self.fail_silently = fail_silently
39 | self._connection = None
40 |
41 | def __enter__(self):
42 | self.connect()
43 | return self
44 |
45 | def __exit__(self, exc_type, exc_val, exc_tb):
46 | self.close()
47 |
48 | def connect(self):
49 | try:
50 | credentials = pika.PlainCredentials(self.username, self.password)
51 | self._connection = pika.BlockingConnection(
52 | pika.ConnectionParameters(
53 | host=self.host,
54 | port=self.port,
55 | virtual_host=self.virtual_host,
56 | credentials=credentials,
57 | )
58 | )
59 | except pika.exceptions.AMQPConnectionError as e:
60 | print("Failed to connect to RabbitMQ:", e)
61 | if not self.fail_silently:
62 | raise e
63 |
64 | def is_connected(self) -> bool:
65 | return self._connection is not None and self._connection.is_open
66 |
67 | def get_channel(self):
68 | if self.is_connected():
69 | return self._connection.channel()
70 |
71 | def close(self):
72 | if self.is_connected():
73 | self._connection.close()
74 | self._connection = None
75 | print("Closed RabbitMQ connection")
76 |
77 |
78 | def publish_to_rabbitmq(queue_name: str, data: str):
79 | """Publish data to a RabbitMQ queue."""
80 | try:
81 | # Create an instance of RabbitMQConnection
82 | rabbitmq_conn = RabbitMQConnection()
83 |
84 | # Establish connection
85 | with rabbitmq_conn:
86 | channel = rabbitmq_conn.get_channel()
87 |
88 | # Ensure the queue exists
89 | channel.queue_declare(queue=queue_name, durable=True)
90 |
91 | # Delivery confirmation
92 | channel.confirm_delivery()
93 |
94 | # Send data to the queue
95 | channel.basic_publish(
96 | exchange="",
97 | routing_key=queue_name,
98 | body=data,
99 | properties=pika.BasicProperties(
100 | delivery_mode=2, # make message persistent
101 | ),
102 | )
103 | print("Sent data to RabbitMQ:", data)
104 | except pika.exceptions.UnroutableError:
105 | print("Message could not be routed")
106 | except Exception as e:
107 | print(f"Error publishing to RabbitMQ: {e}")
108 |
109 |
110 | if __name__ == "__main__":
111 | publish_to_rabbitmq("test_queue", "Hello, World!")
112 |
--------------------------------------------------------------------------------
/db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/J-coder118/LLM-Twin/707f9d8bb1cf402e04644bff9c5c521ce0938087/db/__init__.py
--------------------------------------------------------------------------------
/db/documents.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | from typing import List, Optional
3 |
4 | from errors import ImproperlyConfigured
5 | from pydantic import UUID4, BaseModel, ConfigDict, Field
6 | from pymongo import errors
7 | from utils import get_logger
8 |
9 | from db.mongo import connection
10 |
11 | _database = connection.get_database("scrabble")
12 |
13 | logger = get_logger(__name__)
14 |
15 |
16 | class BaseDocument(BaseModel):
17 | id: UUID4 = Field(default_factory=uuid.uuid4)
18 |
19 | model_config = ConfigDict(from_attributes=True, populate_by_name=True)
20 |
21 | @classmethod
22 | def from_mongo(cls, data: dict):
23 | """Convert "_id" (str object) into "id" (UUID object)."""
24 | if not data:
25 | return data
26 |
27 | id = data.pop("_id", None)
28 | return cls(**dict(data, id=id))
29 |
30 | def to_mongo(self, **kwargs) -> dict:
31 | """Convert "id" (UUID object) into "_id" (str object)."""
32 | exclude_unset = kwargs.pop("exclude_unset", False)
33 | by_alias = kwargs.pop("by_alias", True)
34 |
35 | parsed = self.model_dump(
36 | exclude_unset=exclude_unset, by_alias=by_alias, **kwargs
37 | )
38 |
39 | if "_id" not in parsed and "id" in parsed:
40 | parsed["_id"] = str(parsed.pop("id"))
41 |
42 | return parsed
43 |
44 | def save(self, **kwargs):
45 | collection = _database[self._get_collection_name()]
46 |
47 | try:
48 | result = collection.insert_one(self.to_mongo(**kwargs))
49 | return result.inserted_id
50 | except errors.WriteError:
51 | logger.exception("Failed to insert document.")
52 |
53 | return None
54 |
55 | @classmethod
56 | def get_or_create(cls, **filter_options) -> Optional[str]:
57 | collection = _database[cls._get_collection_name()]
58 | try:
59 | instance = collection.find_one(filter_options)
60 | if instance:
61 | return str(cls.from_mongo(instance).id)
62 | new_instance = cls(**filter_options)
63 | new_instance = new_instance.save()
64 | return new_instance
65 | except errors.OperationFailure:
66 | logger.exception("Failed to retrieve or create document.")
67 |
68 | return None
69 |
70 | @classmethod
71 | def bulk_insert(cls, documents: List, **kwargs) -> Optional[List[str]]:
72 | collection = _database[cls._get_collection_name()]
73 | try:
74 | result = collection.insert_many(
75 | [doc.to_mongo(**kwargs) for doc in documents]
76 | )
77 | return result.inserted_ids
78 | except errors.WriteError:
79 | logger.exception("Failed to insert documents.")
80 |
81 | return None
82 |
83 | @classmethod
84 | def _get_collection_name(cls):
85 | if not hasattr(cls, "Settings") or not hasattr(cls.Settings, "name"):
86 | raise ImproperlyConfigured(
87 |                 "Document should define a Settings configuration class with the name of the collection."
88 | )
89 |
90 | return cls.Settings.name
91 |
92 |
93 | class UserDocument(BaseDocument):
94 | first_name: str
95 | last_name: str
96 |
97 | class Settings:
98 | name = "users"
99 |
100 |
101 | class RepositoryDocument(BaseDocument):
102 | name: str
103 | link: str
104 | content: dict
105 | owner_id: str = Field(alias="owner_id")
106 |
107 | class Settings:
108 | name = "repositories"
109 |
110 |
111 | class PostDocument(BaseDocument):
112 | platform: str
113 | content: dict
114 | author_id: str = Field(alias="author_id")
115 |
116 | class Settings:
117 | name = "posts"
118 |
119 |
120 | class ArticleDocument(BaseDocument):
121 | platform: str
122 | link: str
123 | content: dict
124 | author_id: str = Field(alias="author_id")
125 |
126 | class Settings:
127 | name = "articles"
128 |
--------------------------------------------------------------------------------
/db/mongo.py:
--------------------------------------------------------------------------------
1 | from pymongo import MongoClient
2 | from pymongo.errors import ConnectionFailure
3 |
4 | from config import settings
5 |
6 |
7 | class MongoDatabaseConnector:
8 | """Singleton class to connect to MongoDB database."""
9 |
10 | _instance: MongoClient = None
11 |
12 | def __new__(cls, *args, **kwargs):
13 | if cls._instance is None:
14 | try:
15 | cls._instance = MongoClient(settings.MONGO_DATABASE_HOST)
16 | except ConnectionFailure as e:
17 | print(f"Couldn't connect to the database: {str(e)}")
18 | raise
19 |
20 | print(
21 | f"Connection to database with uri: {settings.MONGO_DATABASE_HOST} successful"
22 | )
23 | return cls._instance
24 |
25 | def get_database(self):
26 | return self._instance[settings.MONGO_DATABASE_NAME]
27 |
28 | def close(self):
29 | if self._instance:
30 | self._instance.close()
31 |             print("Connection to the database has been closed.")
32 |
33 |
34 | connection = MongoDatabaseConnector()
35 |
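A short, illustrative sketch of how the connector behaves: because __new__ caches the MongoClient and returns it directly, every construction yields the same client, which can then be indexed to obtain the working database (assumes the replica set from docker-compose.yml is running):

from config import settings
from db.mongo import MongoDatabaseConnector, connection

# The singleton always hands back the one cached MongoClient.
assert MongoDatabaseConnector() is connection

# Index the client with the configured database name to get a Database handle.
database = connection[settings.MONGO_DATABASE_NAME]
print(database.list_collection_names())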
--------------------------------------------------------------------------------
/dispatcher.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from crawlers.base import BaseCrawler
4 |
5 |
6 | class CrawlerDispatcher:
7 | def __init__(self) -> None:
8 | self._crawlers = {}
9 |
10 | def register(self, domain: str, crawler: type[BaseCrawler]) -> None:
11 | self._crawlers[r"https://(www\.)?{}.com/*".format(re.escape(domain))] = crawler
12 |
13 | def get_crawler(self, url: str) -> BaseCrawler:
14 | for pattern, crawler in self._crawlers.items():
15 | if re.match(pattern, url):
16 | return crawler()
17 | else:
18 | raise ValueError("No crawler found for the provided link")
19 |
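As an illustration (hypothetical usage, mirroring main.py), register() turns a domain into a pattern such as https://(www\.)?medium.com/* and get_crawler() instantiates the first crawler whose pattern matches the link:

from crawlers import MediumCrawler
from dispatcher import CrawlerDispatcher

dispatcher = CrawlerDispatcher()
dispatcher.register("medium", MediumCrawler)

# Matches both the bare and the www-prefixed domain.
crawler = dispatcher.get_crawler("https://medium.com/@someone/some-post")
assert isinstance(crawler, MediumCrawler)

# Unregistered domains fall through to the for/else branch and raise ValueError.
dispatcher.get_crawler("https://example.org/post")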
--------------------------------------------------------------------------------
/docker-bake.hcl:
--------------------------------------------------------------------------------
1 | group "default" {
2 | targets = ["bytewax", "cdc"]
3 | }
4 |
5 | target "bytewax" {
6 | context = "."
7 | dockerfile = ".docker/Dockerfile.bytewax"
8 | }
9 |
10 | target "cdc" {
11 | context = "."
12 | dockerfile = ".docker/Dockerfile.cdc"
13 | }
14 |
--------------------------------------------------------------------------------
/docker-compose-superlinked.yml:
--------------------------------------------------------------------------------
1 | version: '3.8'
2 |
3 | services:
4 | mongo1:
5 | image: mongo:5
6 | container_name: llm-twin-mongo1
7 | command: ["--replSet", "my-replica-set", "--bind_ip_all", "--port", "30001"]
8 | volumes:
9 | - mongo-replica-1-data:/data/db
10 | ports:
11 | - "30001:30001"
12 | healthcheck:
13 | test: test $$(echo "rs.initiate({_id:'my-replica-set',members:[{_id:0,host:\"mongo1:30001\"},{_id:1,host:\"mongo2:30002\"},{_id:2,host:\"mongo3:30003\"}]}).ok || rs.status().ok" | mongo --port 30001 --quiet) -eq 1
14 | interval: 10s
15 | start_period: 30s
16 | restart: always
17 | networks:
18 | - server_default
19 |
20 | mongo2:
21 | image: mongo:5
22 | container_name: llm-twin-mongo2
23 | command: ["--replSet", "my-replica-set", "--bind_ip_all", "--port", "30002"]
24 | volumes:
25 | - mongo-replica-2-data:/data/db
26 | ports:
27 | - "30002:30002"
28 | restart: always
29 | networks:
30 | - server_default
31 |
32 | mongo3:
33 | image: mongo:5
34 | container_name: llm-twin-mongo3
35 | command: ["--replSet", "my-replica-set", "--bind_ip_all", "--port", "30003"]
36 | volumes:
37 | - mongo-replica-3-data:/data/db
38 | ports:
39 | - "30003:30003"
40 | restart: always
41 | networks:
42 | - server_default
43 |
44 | mq:
45 | image: rabbitmq:3-management-alpine
46 | container_name: llm-twin-mq
47 | ports:
48 | - "5672:5672"
49 | - "15672:15672"
50 | volumes:
51 | - ~/rabbitmq/data/:/var/lib/rabbitmq/
52 | - ~/rabbitmq/log/:/var/log/rabbitmq
53 | healthcheck:
54 | test: ["CMD", "rabbitmqctl", "ping"]
55 | interval: 30s
56 | timeout: 10s
57 | retries: 5
58 | restart: always
59 | networks:
60 | - server_default
61 |
62 | crawler:
63 | image: "llm-twin-crawler"
64 | container_name: llm-twin-crawler
65 | platform: "linux/amd64"
66 | build:
67 | context: .
68 | dockerfile: .docker/Dockerfile.crawlers
69 | env_file:
70 | - .env
71 | ports:
72 | - "9010:8080"
73 | depends_on:
74 | - mongo1
75 | - mongo2
76 | - mongo3
77 | networks:
78 | - server_default
79 |
80 | cdc:
81 | image: "llm-twin-cdc"
82 | container_name: llm-twin-cdc
83 | build:
84 | context: .
85 | dockerfile: .docker/Dockerfile.cdc
86 | env_file:
87 | - .env
88 | depends_on:
89 | - mongo1
90 | - mongo2
91 | - mongo3
92 | - mq
93 | networks:
94 | - server_default
95 |
96 | bytewax:
97 | image: "llm-twin-bytewax-superlinked"
98 | container_name: llm-twin-bytewax-superlinked
99 | build:
100 | context: .
101 | dockerfile: .docker/Dockerfile.bytewax.superlinked
102 | environment:
103 | BYTEWAX_PYTHON_FILE_PATH: "main:flow"
104 | DEBUG: "false"
105 | BYTEWAX_KEEP_CONTAINER_ALIVE: "false"
106 | env_file:
107 | - .env
108 | depends_on:
109 | - mq
110 | restart: on-failure
111 | networks:
112 | - server_default
113 |
114 | volumes:
115 | mongo-replica-1-data:
116 | mongo-replica-2-data:
117 | mongo-replica-3-data:
118 |
119 | networks:
120 | server_default:
121 | external: true
122 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.8'
2 |
3 | services:
4 | mongo1:
5 | image: mongo:5
6 | container_name: llm-twin-mongo1
7 | command: ["--replSet", "my-replica-set", "--bind_ip_all", "--port", "30001"]
8 | volumes:
9 | - mongo-replica-1-data:/data/db
10 | ports:
11 | - "30001:30001"
12 | healthcheck:
13 | test: test $$(echo "rs.initiate({_id:'my-replica-set',members:[{_id:0,host:\"mongo1:30001\"},{_id:1,host:\"mongo2:30002\"},{_id:2,host:\"mongo3:30003\"}]}).ok || rs.status().ok" | mongo --port 30001 --quiet) -eq 1
14 | interval: 10s
15 | start_period: 30s
16 | restart: always
17 |
18 | mongo2:
19 | image: mongo:5
20 | container_name: llm-twin-mongo2
21 | command: ["--replSet", "my-replica-set", "--bind_ip_all", "--port", "30002"]
22 | volumes:
23 | - mongo-replica-2-data:/data/db
24 | ports:
25 | - "30002:30002"
26 | restart: always
27 |
28 | mongo3:
29 | image: mongo:5
30 | container_name: llm-twin-mongo3
31 | command: ["--replSet", "my-replica-set", "--bind_ip_all", "--port", "30003"]
32 | volumes:
33 | - mongo-replica-3-data:/data/db
34 | ports:
35 | - "30003:30003"
36 | restart: always
37 |
38 | mq:
39 | image: rabbitmq:3-management-alpine
40 | container_name: llm-twin-mq
41 | ports:
42 | - "5673:5672"
43 | - "15673:15672"
44 | volumes:
45 | - ~/rabbitmq/data/:/var/lib/rabbitmq/
46 | - ~/rabbitmq/log/:/var/log/rabbitmq
47 | restart: always
48 |
49 | qdrant:
50 | image: qdrant/qdrant:latest
51 | container_name: llm-twin-qdrant
52 | ports:
53 | - "6333:6333"
54 | - "6334:6334"
55 | expose:
56 | - "6333"
57 | - "6334"
58 | - "6335"
59 | volumes:
60 | - qdrant-data:/qdrant_data
61 | restart: always
62 |
63 | crawler:
64 | image: "llm-twin-crawler"
65 | container_name: llm-twin-crawler
66 | platform: "linux/amd64"
67 | build:
68 | context: .
69 | dockerfile: .docker/Dockerfile.crawlers
70 | env_file:
71 | - .env
72 | ports:
73 | - "9010:8080"
74 | depends_on:
75 | - mongo1
76 | - mongo2
77 | - mongo3
78 |
79 | cdc:
80 | image: "llm-twin-cdc"
81 | container_name: llm-twin-cdc
82 | build:
83 | context: .
84 | dockerfile: .docker/Dockerfile.cdc
85 | env_file:
86 | - .env
87 | depends_on:
88 | - mongo1
89 | - mongo2
90 | - mongo3
91 | - mq
92 |
93 | bytewax:
94 | image: "llm-twin-bytewax"
95 | container_name: llm-twin-bytewax
96 | build:
97 | context: .
98 | dockerfile: .docker/Dockerfile.bytewax
99 | environment:
100 | BYTEWAX_PYTHON_FILE_PATH: "main:flow"
101 | DEBUG: "false"
102 | BYTEWAX_KEEP_CONTAINER_ALIVE: "true"
103 | env_file:
104 | - .env
105 | depends_on:
106 | - mq
107 | - qdrant
108 | restart: on-failure
109 |
110 | volumes:
111 | mongo-replica-1-data:
112 | mongo-replica-2-data:
113 | mongo-replica-3-data:
114 | qdrant-data:
115 |
--------------------------------------------------------------------------------
/errors.py:
--------------------------------------------------------------------------------
1 | class ScrabbleException(Exception):
2 | pass
3 |
4 |
5 | class ImproperlyConfigured(ScrabbleException):
6 | pass
7 |
--------------------------------------------------------------------------------
/lib.py:
--------------------------------------------------------------------------------
1 | from errors import ImproperlyConfigured
2 |
3 |
4 | def user_to_names(user: str | None) -> tuple[str, str]:
5 | if user is None:
6 | raise ImproperlyConfigured("User name is empty")
7 |
8 |     name_tokens = [token for token in user.split(" ") if token]
9 | if len(name_tokens) == 0:
10 | raise ImproperlyConfigured("User name is empty")
11 | elif len(name_tokens) == 1:
12 | first_name, last_name = name_tokens[0], name_tokens[0]
13 | else:
14 | first_name, last_name = " ".join(name_tokens[:-1]), name_tokens[-1]
15 |
16 | return first_name, last_name
17 |
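A few worked examples of the splitting rule above (illustrative only):

from lib import user_to_names

assert user_to_names("Paul Iusztin") == ("Paul", "Iusztin")
assert user_to_names("Mary Jane Watson") == ("Mary Jane", "Watson")  # everything but the last token becomes the first name
assert user_to_names("Madonna") == ("Madonna", "Madonna")            # a single token is reused for both names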
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from aws_lambda_powertools import Logger
4 | from aws_lambda_powertools.utilities.typing import LambdaContext
5 |
6 | import lib
7 | from crawlers import GithubCrawler, LinkedInCrawler, MediumCrawler
8 | from db.documents import UserDocument
9 | from dispatcher import CrawlerDispatcher
10 |
11 | logger = Logger(service="decodingml/crawler")
12 |
13 | _dispatcher = CrawlerDispatcher()
14 | _dispatcher.register("medium", MediumCrawler)
15 | _dispatcher.register("linkedin", LinkedInCrawler)
16 | _dispatcher.register("github", GithubCrawler)
17 |
18 |
19 | def handler(event: dict, context: LambdaContext) -> dict[str, Any]:
20 | first_name, last_name = lib.user_to_names(event.get("user"))
21 |
22 | user = UserDocument.get_or_create(first_name=first_name, last_name=last_name)
23 |
24 | link = event.get("link")
25 | crawler = _dispatcher.get_crawler(link)
26 |
27 | try:
28 | crawler.extract(link=link, user=user)
29 |
30 | return {"statusCode": 200, "body": "Link processed successfully"}
31 | except Exception as e:
32 | return {"statusCode": 500, "body": f"An error occurred: {str(e)}"}
33 |
34 |
35 | if __name__ == "__main__":
36 | event = {
37 |         "user": "Paul Iusztin",
38 | "link": "https://www.linkedin.com/in/vesaalexandru/",
39 | }
40 | handler(event, None)
41 |
--------------------------------------------------------------------------------
/ops/.gitignore:
--------------------------------------------------------------------------------
1 | /bin/
2 | /node_modules/
3 |
--------------------------------------------------------------------------------
/ops/Pulumi.yaml:
--------------------------------------------------------------------------------
1 | name: decodingml
2 | runtime: nodejs
3 | description: AWS Cloud Infrastructure for the LLM Twin Course
4 | config:
5 | pulumi:tags:
6 | value:
7 | pulumi:template: ""
8 |
--------------------------------------------------------------------------------
/ops/components/cdc.ts:
--------------------------------------------------------------------------------
1 | //TBD
--------------------------------------------------------------------------------
/ops/components/config.ts:
--------------------------------------------------------------------------------
1 | export const SubnetCidrBlocks = {
2 | Internet: '0.0.0.0/0',
3 | VPC: '10.0.0.0/16',
4 | PublicOne: '10.0.0.0/20',
5 | PublicTwo: '10.0.16.0/20',
6 | } as const;
7 |
--------------------------------------------------------------------------------
/ops/components/crawler.ts:
--------------------------------------------------------------------------------
1 | import * as pulumi from "@pulumi/pulumi";
2 | import * as aws from "@pulumi/aws";
3 |
4 | export interface CrawlerProps {
5 |     vpcId: pulumi.Input<string>
6 |     timeout: pulumi.Input<number>
7 |     memory: pulumi.Input<number>
8 | }
9 |
10 | export class Crawler extends pulumi.ComponentResource {
11 |     public readonly arn: pulumi.Output<string>
12 |
13 | constructor (
14 | name: string,
15 | props: CrawlerProps,
16 | opts?: pulumi.ComponentResourceOptions,
17 | ) {
18 | super("decodingml:main:Crawler", name, {}, opts);
19 |
20 | const accountId = pulumi.output(aws.getCallerIdentity()).accountId;
21 | const region = pulumi.output(aws.getRegion()).name;
22 |
23 | const lambdaExecutionRole = new aws.iam.Role(`${name}-role`, {
24 | assumeRolePolicy: JSON.stringify({
25 | Version: "2012-10-17",
26 | Statement: [{
27 | Effect: "Allow",
28 | Principal: {
29 | Service: "lambda.amazonaws.com",
30 | },
31 | Action: "sts:AssumeRole",
32 | }],
33 | }),
34 | managedPolicyArns: [
35 | aws.iam.ManagedPolicy.AmazonS3FullAccess,
36 | aws.iam.ManagedPolicy.AmazonDocDBFullAccess,
37 | aws.iam.ManagedPolicy.AWSLambdaBasicExecutionRole,
38 | aws.iam.ManagedPolicy.AWSLambdaVPCAccessExecutionRole,
39 | aws.iam.ManagedPolicy.CloudWatchLambdaInsightsExecutionRolePolicy
40 | ]
41 | })
42 |
43 | const sg = new aws.ec2.SecurityGroup(`${name}-security-group`, {
44 | name: `${name}-sg`,
45 | description: "Crawler Lambda Access",
46 | vpcId: props.vpcId,
47 | egress: [{
48 | protocol: "-1",
49 | description: "Allow all outbound traffic by default",
50 | fromPort: 0,
51 | toPort: 0,
52 | cidrBlocks: ["0.0.0.0/0"],
53 | }],
54 | tags: {
55 | Name: `${name}-sg`
56 | }
57 | })
58 |
59 | const lambdaFunction = new aws.lambda.Function(`${name}-lambda-function`, {
60 | name: `${name}`,
61 | imageUri: pulumi.interpolate`${accountId}.dkr.ecr.${region}.amazonaws.com/crawler:latest`,
62 | packageType: 'Image',
63 | description: 'Crawler Lambda Function',
64 | timeout: props.timeout,
65 | memorySize: props.memory,
66 | role: lambdaExecutionRole.arn,
67 | vpcConfig: {
68 | subnetIds: pulumi.output(aws.ec2.getSubnets({tags: {Type: 'public'}})).ids,
69 | securityGroupIds: [sg.id],
70 | }
71 | }, {dependsOn: lambdaExecutionRole})
72 |
73 | this.arn = lambdaFunction.arn
74 |
75 | this.registerOutputs({
76 | arn: this.arn
77 | })
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/ops/components/docdb.ts:
--------------------------------------------------------------------------------
1 | import * as pulumi from "@pulumi/pulumi";
2 | import * as aws from "@pulumi/aws";
3 |
4 |
5 | export interface DocumentDBClusterProps {
6 |     vpcId: pulumi.Input<string>
7 |     instanceClass?: pulumi.Input<string>
8 |     multiAZ?: pulumi.Input<boolean>
9 |     port?: pulumi.Input<number>
10 |
11 |     backupRetentionPeriod?: pulumi.Input<number>
12 | }
13 |
14 | export class DocumentDBCluster extends pulumi.ComponentResource {
15 |
16 | constructor (
17 | name: string,
18 | props: DocumentDBClusterProps,
19 | opts?: pulumi.ComponentResourceOptions,
20 | ) {
21 | super("decodingml:main:DocumentDBCluster", name, {}, opts);
22 |
23 |
24 | const subnetGroup = new aws.docdb.SubnetGroup(`${name}-docdb-subnet-group`, {
25 | name: `${name}-cluster-subnet-group`,
26 | description: `VPC subnet group for the ${name}-cluster`,
27 | subnetIds: pulumi.output(aws.ec2.getSubnets({tags: {Type: 'public'}})).ids,
28 | tags: {
29 | Name: `${name}-cluster-subnet-group`
30 | }
31 | }, {parent: this})
32 |
33 | const securityGroup = new aws.ec2.SecurityGroup(`${name}-docdb-sg`, {
34 | name: `${name}-docdb-cluster-sg`,
35 | description: "Database access",
36 | vpcId: props.vpcId,
37 | tags: {
38 | Name: `${name}-docdb-cluster-sg`
39 | },
40 | ingress: [
41 | {
42 | description: "Ingress from anywhere",
43 | fromPort: props.port || 27017,
44 | toPort: props.port || 27017,
45 | protocol: "-1",
46 | },
47 | ],
48 | egress: [{
49 | protocol: "-1",
50 | description: "Allow all outbound traffic by default",
51 | fromPort: 0,
52 | toPort: 0,
53 | cidrBlocks: ["0.0.0.0/0"],
54 | }],
55 | }, {parent: this})
56 |
57 | const cluster = new aws.docdb.Cluster(`${name}-docdb-cluster`, {
58 | // availabilityZones: pulumi.output(aws.getAvailabilityZones({state: "available"}) if props.multiAZ else
59 | backupRetentionPeriod: props.backupRetentionPeriod || 7,
60 | clusterIdentifier: `${name}-cluster`,
61 | masterUsername: pulumi.output(aws.ssm.getParameter({ name: `/${name}/cluster/master/username` })).value,
62 | masterPassword: pulumi.output(aws.ssm.getParameter({ name: `/${name}/cluster/master/password` })).value,
63 | engineVersion: "5.0.0",
64 | port: props.port || 27017,
65 | dbSubnetGroupName: subnetGroup.name,
66 | storageEncrypted: true,
67 | skipFinalSnapshot: true,
68 | vpcSecurityGroupIds: [ securityGroup.id ],
69 | tags: {
70 | Name: `${name}-cluster`
71 | }
72 | }, {parent: this})
73 |
74 | const primaryInstance = new aws.docdb.ClusterInstance(`${name}-docdb-primary-instance`, {
75 | clusterIdentifier: cluster.clusterIdentifier,
76 | identifier: `${name}-primary-instance`,
77 | instanceClass: props.instanceClass || "db.t3.medium",
78 | tags: {
79 | Name: `${name}-primary-instance`
80 | }
81 | }, {parent: this})
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/ops/components/ecs/cluster.ts:
--------------------------------------------------------------------------------
1 | import * as pulumi from "@pulumi/pulumi";
2 | import * as aws from "@pulumi/aws";
3 |
4 |
5 | export interface ECSClusterProps {
6 |     vpcId: pulumi.Input<string>
7 | }
8 |
9 | export class ECSCluster extends pulumi.ComponentResource {
10 |     name: pulumi.Output<string>
11 |
12 | constructor (
13 | name: string,
14 | props: ECSClusterProps,
15 | opts?: pulumi.ComponentResourceOptions,
16 | ) {
17 | super("decodingml:main:ECSCluster", name, {}, opts);
18 |
19 | const cluster = new aws.ecs.Cluster(`${name}-cluster`, {
20 | name: `${name}-cluster`,
21 | }, {parent: this})
22 |
23 | this.name = cluster.name
24 |
25 | const securityGroup = new aws.ec2.SecurityGroup(`${name}-sg`, {
26 | name: `${name}-ecs-host-sg`,
27 | description: 'Access to the ECS hosts that run containers',
28 | vpcId: props.vpcId,
29 | ingress: [
30 | {
31 | description: "Ingress from other containers in the same security group",
32 | fromPort: 0,
33 | toPort: 0,
34 | protocol: "-1",
35 | self: true,
36 | }
37 | ],
38 | egress: [
39 | {
40 | cidrBlocks: ['0.0.0.0/0'],
41 | description: "Allow all outbound traffic by default",
42 | protocol: "-1",
43 | fromPort: 0,
44 | toPort: 0,
45 | },
46 | ],
47 | tags: {
48 | Name: `${name}-ecs-host-sg`
49 | }
50 | }, {parent: this})
51 |
52 | new aws.servicediscovery.PrivateDnsNamespace(`${name}-private-dns-namespace`, {
53 | name: `${name}.internal`,
54 | vpc: props.vpcId,
55 | }, {parent: this})
56 |
57 | this.registerOutputs({
58 | name: this.name
59 | })
60 |
61 | }
62 | }
--------------------------------------------------------------------------------
/ops/components/ecs/iam.ts:
--------------------------------------------------------------------------------
1 | import * as aws from "@pulumi/aws";
2 |
3 |
4 | export const ecsRole = new aws.iam.Role("ecs-role", {
5 | name: `ecs-role`,
6 | assumeRolePolicy: aws.iam.assumeRolePolicyForPrincipal({ Service: "ecs.amazonaws.com" }),
7 | path: "/",
8 | inlinePolicies: [{
9 | name: "ecs-service",
10 | policy: JSON.stringify({
11 | Statement: [{
12 | Action: [
13 | 'ec2:AttachNetworkInterface',
14 | 'ec2:CreateNetworkInterface',
15 | 'ec2:CreateNetworkInterfacePermission',
16 | 'ec2:DeleteNetworkInterface',
17 | 'ec2:DeleteNetworkInterfacePermission',
18 | 'ec2:Describe*',
19 | 'ec2:DetachNetworkInterface',
20 | 'elasticloadbalancing:DeregisterInstancesFromLoadBalancer',
21 | 'elasticloadbalancing:DeregisterTargets',
22 | 'elasticloadbalancing:Describe*',
23 | 'elasticloadbalancing:RegisterInstancesWithLoadBalancer',
24 | 'elasticloadbalancing:RegisterTargets'
25 | ],
26 | Effect: 'Allow',
27 | Resource: '*'
28 | }],
29 | Version: '2012-10-17',
30 | } as aws.iam.PolicyDocument)
31 | }]
32 | })
33 |
34 |
35 | export const ecsTaskExecutionRole = new aws.iam.Role("ecs-task-execution-role", {
36 | name: `ecs-task-execution-role`,
37 | assumeRolePolicy: aws.iam.assumeRolePolicyForPrincipal({ Service: "ecs-tasks.amazonaws.com" }),
38 | path: "/",
39 | inlinePolicies: [
40 | {
41 | name: "ecs-logs",
42 | policy: JSON.stringify({
43 | Statement: [{
44 | Action: [
45 | 'logs:CreateLogGroup'
46 | ],
47 | Effect: 'Allow',
48 | Resource: '*'
49 | }]
50 | } as aws.iam.PolicyDocument),
51 | },
52 | {
53 | name: "ecs-ssm",
54 | policy: JSON.stringify({
55 | Statement: [{
56 | Sid: "readEnvironmentParameters",
57 | Action: [
58 | 'ssm:GetParameters'
59 | ],
60 | Effect: 'Allow',
61 | Resource: "*"
62 | }]
63 | } as aws.iam.PolicyDocument),
64 | }
65 | ],
66 | managedPolicyArns: [
67 | 'arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy'
68 | ]
69 | })
--------------------------------------------------------------------------------
/ops/components/ecs/service.ts:
--------------------------------------------------------------------------------
1 | import * as pulumi from "@pulumi/pulumi";
2 | import * as aws from "@pulumi/aws";
3 |
4 | export interface ContainerSecrets {
5 |     name: pulumi.Input<string>;
6 |     parameter: pulumi.Input<string>;
7 | }
8 |
9 | export interface ServiceProps {
10 |     vpcId: pulumi.Input<string>
11 |
12 |     cluster: pulumi.Input<string>;
13 |     environment?: pulumi.Input<{ name: string; value: string }[]>;
14 |     secrets: ContainerSecrets[];
15 |
16 |     command?: pulumi.Input<string[]>;
17 |     imageTag?: pulumi.Input<string>;
18 |     containerPort: pulumi.Input<number>;
19 |     containerCpu?: pulumi.Input<string>;
20 |     containerMemory?: pulumi.Input<string>;
21 |
22 |     deploymentController?: pulumi.Input<string>;
23 |
24 |     desiredCount?: pulumi.Input<number>;
25 |     role?: pulumi.Input<string>;
26 | }
27 |
28 | export class Service extends pulumi.ComponentResource {
29 | constructor (
30 | name: string,
31 | props: ServiceProps,
32 | opts?: pulumi.ComponentResourceOptions,
33 | ) {
34 |
35 | super("decodingml:main:Service", name, {}, opts);
36 |
37 | const accountId = pulumi.output(aws.getCallerIdentity()).accountId;
38 | const region = pulumi.output(aws.getRegion()).name;
39 |
40 | const imageUrl = pulumi.interpolate`${accountId}.dkr.ecr.${region}.amazonaws.com/chamberlain:latest`
41 |
42 | const containerSecrets = props.secrets.map(secret => {
43 | return {
44 | name: secret.name,
45 | valueFrom: pulumi.interpolate`arn:aws:ssm:${region}:${accountId}:parameter/${secret.parameter}`
46 | } as aws.ecs.Secret;
47 | })
48 |
49 | const logGroup = new aws.cloudwatch.LogGroup(`log-group`, {
50 | name: `/ecs/${props.cluster}/${name}`,
51 | retentionInDays: 90,
52 | tags: {
53 | Name: `${props.cluster}-${name}-cluster-log-group`
54 | }
55 | })
56 |
57 | const taskDefinition = new aws.ecs.TaskDefinition(`${name}-ecs-task-definition`, {
58 | family: name,
59 | networkMode: 'awsvpc',
60 | requiresCompatibilities: ["FARGATE"],
61 | cpu: props.containerCpu || "512",
62 | memory: props.containerMemory || "1024",
63 | executionRoleArn: pulumi.output(aws.iam.getRole({name: `ecs-task-execution-role`})).arn,
64 | taskRoleArn: props.role,
65 | containerDefinitions: pulumi
66 |                 .all([imageUrl, logGroup.name, props.environment, containerSecrets])
67 | .apply(([image,logGroup,environment, secrets]) =>
68 | JSON.stringify([{
69 | name: name,
70 | image: image,
71 | portMappings: [{
72 | containerPort: props.containerPort,
73 | }],
74 | command: props.command,
75 | environment: environment,
76 | secrets: secrets,
77 | logConfiguration: {
78 | logDriver: "awslogs",
79 | options: {
80 | "awslogs-group": logGroup,
81 | "awslogs-create-group": "true",
82 | "awslogs-region": "eu-central-1",
83 | "awslogs-stream-prefix": name,
84 | },
85 | },
86 | } as aws.ecs.ContainerDefinition])
87 | )
88 | }, {parent: this})
89 |
90 | const serviceDiscovery = new aws.servicediscovery.Service(`${name}-service-discovery`, {
91 | name: name,
92 | description: `Service discovery for ${name}`,
93 | dnsConfig: {
94 | routingPolicy: "MULTIVALUE",
95 | dnsRecords: [{ type: "A", ttl: 60 }],
96 | namespaceId: pulumi.output(aws.servicediscovery.getDnsNamespace({
97 | name: `streaming.internal`,
98 | type: 'DNS_PRIVATE',
99 | })).id,
100 | },
101 | healthCheckCustomConfig: {
102 | failureThreshold: 1
103 | },
104 | }, {parent: this})
105 |
106 | new aws.ecs.Service(`${name}-ecs-service`, {
107 | name: `${name}-service`,
108 | cluster: props.cluster,
109 | launchType: 'FARGATE',
110 | deploymentController: {
111 | type: props.deploymentController || "ECS",
112 | },
113 | desiredCount: props.desiredCount || 1,
114 | taskDefinition: taskDefinition.arn,
115 | serviceRegistries: {
116 | registryArn: serviceDiscovery.arn,
117 | containerName: `${name}`,
118 | },
119 | networkConfiguration: {
120 | assignPublicIp: false,
121 | securityGroups: pulumi.output(aws.ec2.getSecurityGroups({
122 | tags: {Name: `ecs-host-sg`}
123 | })).ids,
124 | subnets: pulumi.output(aws.ec2.getSubnets({tags: {Type: 'private'}})).ids
125 | }
126 | }, {parent: this})
127 | }
128 | }
--------------------------------------------------------------------------------
/ops/components/mq.ts:
--------------------------------------------------------------------------------
1 | import * as pulumi from "@pulumi/pulumi";
2 | import * as aws from "@pulumi/aws";
3 |
4 |
5 | export interface MessageQueueBrokerProps {
6 |     vpcId: pulumi.Input<string>
7 |
8 |     engineVersion?: pulumi.Input<string>
9 |     instanceType?: pulumi.Input<string>
10 |
11 | }
12 |
13 | export class MessageQueueBroker extends pulumi.ComponentResource {
14 |
15 | constructor(
16 | name: string,
17 | props: MessageQueueBrokerProps,
18 | opts?: pulumi.ComponentResourceOptions,
19 | ) {
20 | super("decodingml:main:MessageQueueBroker", name, {}, opts);
21 |
22 | const accountId = pulumi.output(aws.getCallerIdentity()).accountId;
23 | const region = pulumi.output(aws.getRegion()).name;
24 |
25 | const securityGroup = new aws.ec2.SecurityGroup(`${name}-mq-sg`, {
26 | name: `${name}-mq-sg`,
27 | description: "Message Queue broker access",
28 | vpcId: props.vpcId,
29 | ingress: [
30 | {
31 | description: "Ingress from AMPQS protocol",
32 | fromPort: 5671,
33 | toPort: 5671,
34 | protocol: "tcp",
35 | },
36 | {
37 | description: "Ingress from HTTPS protocol",
38 | fromPort: 443,
39 | toPort: 443,
40 | protocol: "tcp",
41 | },
42 | ],
43 | egress: [{
44 | protocol: "-1",
45 | description: "Allow all outbound traffic by default",
46 | fromPort: 0,
47 | toPort: 0,
48 | cidrBlocks: ["0.0.0.0/0"],
49 | }],
50 | tags: {
51 | Name: `${name}-mq-sg`
52 | },
53 | }, {parent: this})
54 |
55 | const broker = new aws.mq.Broker(`${name}-mq-broker`, {
56 | brokerName: `${name}-mq-broker`,
57 | engineType: "RabbitMQ",
58 | engineVersion: props.engineVersion || "3.11.20",
59 | hostInstanceType: props.instanceType || "mq.t3.micro",
60 | securityGroups: [securityGroup.id],
61 | deploymentMode: "SINGLE_INSTANCE",
62 | logs: {
63 | general: true,
64 | },
65 | publiclyAccessible: true,
66 | subnetIds: pulumi.output(aws.ec2.getSubnets({tags: {Type: 'public'}})).ids,
67 | users: pulumi.all([
68 | this.getSecretValue(`arn:aws:secretsmanager:${region}:${accountId}:secret:/${name}/broker/admin`),
69 | this.getSecretValue(`arn:aws:secretsmanager:${region}:${accountId}:secret:/${name}/broker/replication-user`)
70 | ]).apply(([adminSecret, replicationUserSecret]) => [
71 | {
72 | username: JSON.parse(adminSecret).username,
73 | password: JSON.parse(adminSecret).password,
74 | consoleAccess: true,
75 | },
76 | {
77 | username: JSON.parse(replicationUserSecret).username,
78 | password: JSON.parse(replicationUserSecret).password,
79 | consoleAccess: true,
80 | replicationUser: true
81 | }
82 | ]),
83 | tags: {
84 | Name: `${name}-mq-sg`
85 | },
86 | }, {parent: this})
87 |
88 | const hostSSMParameter = new aws.ssm.Parameter(`${name}-mq-broker-host-ssm-parameter`, {
89 | name: `/${name}/broker/host`,
90 | type: aws.ssm.ParameterType.String,
91 | description: `RabbitMQ cluster host for ${name}-mq-broker`,
92 | value: broker.instances[0].endpoints[0].apply(endpoint => {
93 | return endpoint.split(":")[0];
94 | }),
95 | }, {parent: this})
96 |
97 | const portSSMParameter = new aws.ssm.Parameter(`${name}-mq-broker-port-ssm-parameter`, {
98 | name: `/${name}/broker/port`,
99 | type: aws.ssm.ParameterType.String,
100 | description: `RabbitMQ cluster port for ${name}-mq-broker`,
101 | value: "5671",
102 | }, {parent: this})
103 | }
104 |
105 |     private async getSecretValue(secretName: string): Promise<pulumi.Output<string>> {
106 | return pulumi.output(aws.secretsmanager.getSecretVersion({
107 | secretId: secretName,
108 | }, { async: true })).apply(secretVersion => {
109 | if (!secretVersion.secretString) {
110 | throw new Error("Secret version contains no string data");
111 | }
112 | return secretVersion.secretString;
113 | });
114 | }
115 | }
--------------------------------------------------------------------------------
/ops/components/nat.ts:
--------------------------------------------------------------------------------
1 | import * as pulumi from "@pulumi/pulumi";
2 | import * as aws from "@pulumi/aws";
3 | import {SubnetCidrBlocks} from "./config";
4 |
5 | export interface NatGatewayProps {
6 |     env: pulumi.Input<string>
7 |     vpcId: pulumi.Input<string>
8 |     subnet: pulumi.Input<string>
9 |
10 |     instanceImageAmiId?: pulumi.Input<string>
11 | }
12 |
13 | export class NatGateway extends pulumi.ComponentResource {
14 |     public readonly id: pulumi.Output<string>
15 |
16 | constructor(
17 | name: string,
18 | props: NatGatewayProps,
19 | opts?: pulumi.ComponentResourceOptions,
20 | ) {
21 | super("decodingml:main:NatGateway", name, {}, opts);
22 |
23 | const config = new pulumi.Config();
24 |
25 | const sg = new aws.ec2.SecurityGroup(`${name}-security-group`, {
26 | description: "Security Group for NAT Gateway",
27 | ingress: [
28 | {
29 | cidrBlocks: [SubnetCidrBlocks.VPC],
30 | description: "Allow all inbound traffic from network",
31 | protocol: "-1",
32 | fromPort: 0,
33 | toPort: 0,
34 | },
35 | ],
36 | egress: [
37 | {
38 | cidrBlocks: ['0.0.0.0/0'],
39 | description: "Allow all outbound traffic by default",
40 | protocol: "-1",
41 | fromPort: 0,
42 | toPort: 0,
43 | },
44 | ],
45 | vpcId: props.vpcId,
46 | }, {parent: this})
47 |
48 | const iamRole = new aws.iam.Role(`${name}-role`, {
49 | assumeRolePolicy: {
50 | Version: '2012-10-17',
51 | Statement: [
52 | {
53 | Action: ['sts:AssumeRole'],
54 | Effect: 'Allow',
55 | Principal: {
56 | Service: 'ec2.amazonaws.com',
57 | },
58 | },
59 | ],
60 | },
61 | managedPolicyArns: [
62 | `arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore`,
63 | ],
64 | inlinePolicies: [
65 | {
66 | name: 'for-nat',
67 | policy: JSON.stringify({
68 | Statement: [
69 | {
70 | Action: [
71 | 'ec2:AttachNetworkInterface',
72 | 'ec2:ModifyNetworkInterfaceAttribute',
73 | 'ec2:AssociateAddress',
74 | 'ec2:DisassociateAddress',
75 | 'ec2:*',
76 | ],
77 | Effect: 'Allow',
78 | Resource: '*',
79 | },
80 | ],
81 | Version: '2012-10-17',
82 | } as aws.iam.PolicyDocument),
83 | },
84 | ],
85 |
86 | }, {parent: this})
87 |
88 | const eni = new aws.ec2.NetworkInterface(`${name}-eni`, {
89 | subnetId: props.subnet,
90 | securityGroups: [sg.id],
91 | sourceDestCheck: false,
92 | }, {parent: this})
93 |
94 | this.id = eni.id
95 |
96 | const instanceProfile = new aws.iam.InstanceProfile(`${name}-instance-profile`, {
97 | role: iamRole
98 | }, {parent: this})
99 |
100 | const launchTemplate = new aws.ec2.LaunchTemplate(`${name}-launch-template`, {
101 | name: `pi-${props.env}-nat-launch-template`,
102 | imageId: config.require('natInstanceImageId'),
103 | instanceType: 't4g.nano',
104 | iamInstanceProfile: { arn: instanceProfile.arn },
105 | vpcSecurityGroupIds: [ sg.id ],
106 | userData: eni.id.apply(id =>
107 | Buffer.from(
108 | [
109 | '#!/bin/bash',
110 | `echo "eni_id=${id}" >> /etc/fck-nat.conf`,
111 | 'service fck-nat restart',
112 | ].join('\n'),
113 | ).toString('base64'),
114 | ),
115 | tags: {
116 | Name: `pi-${props.env}-nat-launch-template`
117 | },
118 | tagSpecifications: [{
119 | tags: {
120 | Name: `pi-${props.env}-nat-launch-template`
121 | },
122 | resourceType: 'instance'
123 | }]
124 | }, {dependsOn: instanceProfile, parent: this})
125 |
126 |
127 | new aws.autoscaling.Group(`${name}-autoscaling-group`, {
128 | maxSize: 1,
129 | minSize: 1,
130 | desiredCapacity: 1,
131 | launchTemplate: {
132 | id: launchTemplate.id,
133 | version: '$Latest',
134 | },
135 | vpcZoneIdentifiers: [ props.subnet ],
136 | tags: [{ key: 'Name', value: `pi-${props.env}-nat-instance-launch-template`, propagateAtLaunch: true }]
137 | }, {parent: this})
138 |
139 | this.registerOutputs({
140 | id: this.id,
141 | })
142 | }
143 | }
--------------------------------------------------------------------------------
/ops/components/repository.ts:
--------------------------------------------------------------------------------
1 | import * as pulumi from '@pulumi/pulumi'
2 | import * as aws from '@pulumi/aws'
3 |
4 | interface Props {}
5 |
6 | export class Repository extends pulumi.ComponentResource {
7 |     public name: pulumi.Output<string>
8 |     public arn: pulumi.Output<string>
9 |     public url: pulumi.Output<string>
10 |
11 | public static dockerTags = {
12 | github: 'github-crawler-latest',
13 | linkedin: 'linkedin-crawler-latest',
14 | medium: 'medium-crawler-latest',
15 |
16 | } as const
17 |
18 | private readonly tags = {
19 | module: 'ai',
20 | scope: 'ecr',
21 | }
22 |
23 | constructor(
24 | name: string,
25 | props: Props,
26 | opts?: pulumi.ComponentResourceOptions,
27 | ) {
28 |         super('decodingml:ai:ecr', name, {}, opts)
29 |
30 | const ecr = new aws.ecr.Repository(
31 | `${name}-repository`,
32 | {
33 | name,
34 | tags: this.tags,
35 | imageTagMutability: 'MUTABLE',
36 | },
37 | { parent: this },
38 | )
39 |
40 | new aws.ecr.LifecyclePolicy(
41 | `${name}-lifecycle-policy`,
42 | {
43 | repository: ecr.name,
44 | policy: {
45 | rules: [
46 | {
47 | action: { type: 'expire' },
48 | selection: {
49 | tagStatus: 'untagged',
50 | countNumber: 30,
51 | countUnit: 'days',
52 | countType: 'sinceImagePushed',
53 | },
54 | rulePriority: 1,
55 | description: 'Delete older than 30 days images with no tag.',
56 | },
57 | ],
58 | },
59 | },
60 | { parent: this },
61 | )
62 |
63 | this.arn = ecr.arn
64 | this.name = ecr.name
65 | this.url = ecr.repositoryUrl
66 |
67 | this.registerOutputs()
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/ops/components/vpc.ts:
--------------------------------------------------------------------------------
1 | import * as pulumi from '@pulumi/pulumi'
2 | import * as aws from '@pulumi/aws'
3 | import {SubnetCidrBlocks} from "./config";
4 |
5 | interface VpcProps {}
6 |
7 | export class Vpc extends pulumi.ComponentResource {
8 |     public readonly id: pulumi.Output<string>
9 |
10 | constructor(
11 | name: string,
12 | props: VpcProps,
13 | opts?: pulumi.ComponentResourceOptions,
14 | ) {
15 | super("decodingml:main:Vpc", name, {}, opts);
16 |
17 | const vpc = new aws.ec2.Vpc(`${name}-vpc`, {
18 | cidrBlock: SubnetCidrBlocks.VPC,
19 | enableDnsSupport: true,
20 | enableDnsHostnames: true,
21 | tags: {
22 | Name: `${name}-vpc`,
23 | },
24 | }, { parent: this });
25 |
26 | this.id = vpc.id
27 |
28 | const azs = aws.getAvailabilityZones({
29 | state: "available"
30 | })
31 |
32 | const publicSubnetOne = new aws.ec2.Subnet(`${name}-public-subnet-one`, {
33 | vpcId: vpc.id,
34 | availabilityZone: azs.then(azs => azs.names?.[0]),
35 | cidrBlock: SubnetCidrBlocks.PublicOne,
36 | mapPublicIpOnLaunch: true,
37 | tags: {
38 | Name: `${name}-public-subnet-one`,
39 | Type: 'public',
40 | }
41 | }, {parent: this})
42 |
43 | const publicSubnetTwo = new aws.ec2.Subnet(`${name}-public-subnet-two`, {
44 | vpcId: vpc.id,
45 | availabilityZone: azs.then(azs => azs.names?.[1]),
46 | cidrBlock: SubnetCidrBlocks.PublicTwo,
47 | mapPublicIpOnLaunch: true,
48 | tags: {
49 | Name: `${name}-public-subnet-two`,
50 | Type: 'public',
51 | }
52 | }, {parent: this})
53 |
54 | // Setup networking resources for the public subnets.
55 | const internetGateway= new aws.ec2.InternetGateway(`${name}-internet-gateway`, {
56 | vpcId: vpc.id,
57 | tags: {
58 | Name: `${name}-internet-gateway`
59 | }
60 | }, {parent: this})
61 |
62 | const publicRouteTable = new aws.ec2.RouteTable(`${name}-public-route-table`, {
63 | vpcId: vpc.id,
64 | tags: {
65 | Name: `${name}-public-route-table`
66 | }
67 | }, {parent: this})
68 |
69 | new aws.ec2.Route(`${name}-public-route`, {
70 | routeTableId: publicRouteTable.id,
71 | destinationCidrBlock: "0.0.0.0/0",
72 | gatewayId: internetGateway.id
73 | }, {parent: this})
74 |
75 | new aws.ec2.RouteTableAssociation(`${name}-public-subnet-one-rta`, {
76 | subnetId: publicSubnetOne.id,
77 | routeTableId: publicRouteTable.id
78 | }, {parent: this})
79 | new aws.ec2.RouteTableAssociation(`${name}-public-subnet-two-rta`, {
80 | subnetId: publicSubnetTwo.id,
81 | routeTableId: publicRouteTable.id,
82 | }, {parent: this})
83 |
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/ops/index.ts:
--------------------------------------------------------------------------------
1 | import * as pulumi from "@pulumi/pulumi";
2 | import {Vpc} from "./components/vpc";
3 | import {DocumentDBCluster} from "./components/docdb";
4 | import {Crawler} from "./components/crawler";
5 | import {ECSCluster} from "./components/ecs/cluster";
6 | import {Service} from "./components/ecs/service";
7 |
8 | const vpc= new Vpc("network-overlay", {})
9 |
10 | const docdb = new DocumentDBCluster("warehouse", {
11 | vpcId: vpc.id,
12 | instanceClass: "db.t3.medium",
13 | }, {dependsOn: vpc})
14 |
15 | const lambda = new Crawler("crawler", {
16 | vpcId: vpc.id,
17 | timeout: 900,
18 | memory: 3008
19 | })
20 |
21 | const cluster = new ECSCluster("streaming", {
22 | vpcId: vpc.id
23 | })
24 |
25 | const bytewaxWorker = new Service("bytewax-worker", {
26 | vpcId: vpc.id,
27 | cluster: cluster.name,
28 | containerPort: 9000,
29 | secrets: [
30 | {
31 | name: "MONGO_DATABASE_HOST",
32 | parameter: "database/host",
33 | },
34 | {
35 | name: "OPENAI_API_KEY",
36 | parameter: "database/host",
37 | },
38 | {
39 | name: "QDRANT_DATABASE_HOST",
40 | parameter: "database/username",
41 | },
42 | {
43 | name: "QDRANT_DATABASE_PORT",
44 | parameter: "database/host",
45 | },
46 | {
47 | name: "QDRANT_APIKEY",
48 | parameter: "database/username",
49 | },
50 | {
51 | name: "RABBITMQ_HOST",
52 | parameter: "database/host",
53 | },
54 | {
55 | name: "RABBITMQ_PORT",
56 | parameter: "database/username",
57 | },
58 | {
59 | name: "RABBITMQ_DEFAULT_USERNAME",
60 | parameter: "database/host",
61 | },
62 | {
63 | name: "RABBITMQ_DEFAULT_PASSWORD",
64 | parameter: "database/username",
65 | },
66 | ]
67 | })
68 |
69 |
70 | export const VpcID: pulumi.Output<string> = vpc.id
71 |
--------------------------------------------------------------------------------
/ops/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "decodingml",
3 | "main": "index.ts",
4 | "devDependencies": {
5 | "@types/node": "^18"
6 | },
7 | "dependencies": {
8 | "@pulumi/pulumi": "^3.0.0",
9 | "@pulumi/aws": "^6.0.0",
10 | "@pulumi/awsx": "^2.0.2"
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/ops/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "strict": true,
4 | "outDir": "bin",
5 | "target": "es2016",
6 | "module": "commonjs",
7 | "moduleResolution": "node",
8 | "sourceMap": true,
9 | "experimentalDecorators": true,
10 | "pretty": true,
11 | "noFallthroughCasesInSwitch": true,
12 | "noImplicitReturns": true,
13 | "forceConsistentCasingInFileNames": true
14 | },
15 | "files": [
16 | "index.ts"
17 | ]
18 | }
19 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "rag-system"
3 | description = ""
4 | version = "0.1.0"
5 | authors = [
6 | "Vlad Adumitracesei ",
7 | "Paul Iusztin ",
8 | "Alex Vesa ",
9 | "Rares Istoc "
10 | ]
11 | readme = "README.md"
12 |
13 | [tool.ruff]
14 | line-length = 88
15 | select = [
16 | "F401",
17 | "F403",
18 | ]
19 |
20 |
21 | [tool.poetry.dependencies]
22 | python = ">=3.10, <3.12"
23 | pydantic = "^2.6.3"
24 | pydantic-settings = "^2.1.0"
25 | pika = "^1.3.2"
26 | qdrant-client = "^1.8.0"
27 | langchain = "^0.1.13"
28 | aws-lambda-powertools = "^2.38.1"
29 | selenium = "4.21.0"
30 | instructorembedding = "^1.0.1"
31 | numpy = "^1.26.4"
32 | langchain-openai = "^0.1.3"
33 | gdown = "^5.1.0"
34 | pymongo = "^4.7.1"
35 | structlog = "^24.1.0"
36 | rich = "^13.7.1"
37 | pip = "^24.0"
38 | comet-ml = "^3.41.0"
39 | ruff = "^0.4.3"
40 | pandas = "^2.0.3"
41 | datasets = "^2.19.1"
42 | transformers = "^4.40.2"
43 | safetensors = "^0.4.3"
44 | bitsandbytes = "^0.42.0"
45 | scikit-learn = "^1.4.2"
46 | unstructured = "^0.14.2"
47 |
48 | [tool.poetry.group.3-feature-pipeline.dependencies]
49 | bytewax = "0.18.2"
50 |
51 | [tool.poetry.group.ml.dependencies]
52 | qwak-inference = "^0.1.17"
53 | comet-llm = "^2.2.4"
54 | qwak-sdk = "^0.5.69"
55 | peft = "^0.11.1"
56 | sentence-transformers = "^2.2.2"
57 | accelerate = "^0.30.1"
58 |
59 | [tool.poetry.group.6-superlinked-rag.dependencies]
60 | superlinked = "7.2.1"
61 |
62 | [build-system]
63 | requires = ["poetry-core"]
64 | build-backend = "poetry.core.masonry.api"
65 |
--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
1 | from pydantic_settings import BaseSettings, SettingsConfigDict
2 |
3 |
4 | class AppSettings(BaseSettings):
5 | model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
6 |
7 | # Embeddings config
8 |     EMBEDDING_MODEL_ID: str = "sentence-transformers/all-MiniLM-L6-v2"  # instruct-xl
9 | EMBEDDING_MODEL_MAX_INPUT_LENGTH: int = 256
10 | EMBEDDING_SIZE: int = 384
11 | EMBEDDING_MODEL_DEVICE: str = "cpu"
12 |
13 | OPENAI_MODEL_ID: str = "gpt-4-1106-preview"
14 | OPENAI_API_KEY: str | None = None
15 |
16 | # MongoDB configs
17 | MONGO_DATABASE_HOST: str = "mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set"
18 | MONGO_DATABASE_NAME: str = "scrabble"
19 |
20 | # QdrantDB config
21 | QDRANT_DATABASE_HOST: str = "localhost"
22 | QDRANT_DATABASE_PORT: int = 6333
23 | QDRANT_DATABASE_URL: str = "http://localhost:6333"
24 | QDRANT_CLOUD_URL: str = "str"
25 | USE_QDRANT_CLOUD: bool = False
26 | QDRANT_APIKEY: str | None = None
27 |
28 | # MQ config
29 | RABBITMQ_DEFAULT_USERNAME: str = "guest"
30 | RABBITMQ_DEFAULT_PASSWORD: str = "guest"
31 | RABBITMQ_HOST: str = "localhost"
32 | RABBITMQ_PORT: int = 5673
33 |
34 | # CometML config
35 | COMET_API_KEY: str | None = None
36 | COMET_WORKSPACE: str | None = None
37 | COMET_PROJECT: str | None = None
38 |
39 | # LinkedIn credentials
40 | LINKEDIN_USERNAME: str | None = None
41 | LINKEDIN_PASSWORD: str | None = None
42 |
43 |
44 | settings = AppSettings()
45 |
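A small, illustrative sketch of how these settings resolve: pydantic-settings fills each field from real environment variables first, then from .env, then from the defaults above, so any value can be overridden without code changes (the override below is hypothetical):

import os

os.environ["MONGO_DATABASE_NAME"] = "llm_twin"  # hypothetical override

from settings import AppSettings

settings = AppSettings()
print(settings.MONGO_DATABASE_NAME)  # -> "llm_twin"
print(settings.QDRANT_DATABASE_URL)  # -> "http://localhost:6333" (default)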
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import structlog
2 |
3 |
4 | def get_logger(cls: str):
5 | return structlog.get_logger().bind(cls=cls)
6 |
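Usage sketch (illustrative; the class name and fields are hypothetical): the bound logger attaches cls to every event it emits.

from utils import get_logger

logger = get_logger("QdrantDatabaseConnector")
logger.info("Connected to collection.", collection_name="vector_posts")
# structlog's default console renderer prints the event plus the cls=... and collection_name=... key-value pairs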
--------------------------------------------------------------------------------