├── app ├── .gitkeep ├── llm-gemma-variant │ ├── readme.md │ ├── src │ │ ├── __init__.py │ │ ├── test.py │ │ ├── model.py │ │ ├── backend.py │ │ └── vector_db.py │ ├── tests │ │ └── __init__.py │ ├── makefile │ └── pyproject.toml ├── course-scraper │ ├── src │ │ ├── scrapers │ │ │ ├── kaggle_learn │ │ │ │ ├── __init__.py │ │ │ │ ├── pages.py │ │ │ │ ├── README.md │ │ │ │ ├── scrape_course.py │ │ │ │ ├── scrape_all_courses.py │ │ │ │ └── models.py │ │ │ └── google_cloud_skill_boost │ │ │ │ ├── __init__.py │ │ │ │ ├── scrape_focus.py │ │ │ │ ├── models.py │ │ │ │ ├── pages.py │ │ │ │ ├── README.md │ │ │ │ ├── scrape_journey.py │ │ │ │ └── scrape_course_template.py │ │ ├── config.py │ │ ├── gsheet.py │ │ └── utils.py │ ├── requirements.txt │ └── README.md ├── llm-poc-variant-01 │ ├── deploy │ │ ├── aws │ │ │ ├── .gitignore │ │ │ ├── keypair.tf │ │ │ ├── provider.tf │ │ │ ├── security_groups.tf │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ └── README.md │ │ └── gcp │ │ │ ├── .gitignore │ │ │ ├── project.tfvars │ │ │ ├── chainlit-app-demo.gif │ │ │ ├── ollama.service │ │ │ ├── provider.tf │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ └── README.md │ ├── .gitignore │ ├── docker │ │ ├── Dockerfile │ │ ├── run-docker-container.sh │ │ └── build-docker-image.sh │ ├── constants.py │ ├── chainlit_app.py │ ├── requirements.txt │ ├── lpiGPT.py │ ├── ingest.py │ └── README.md ├── Jorge_Rocha_campos_ML - Google Docs.pdf └── llm-poc-variant-02 │ ├── .env_template │ ├── requirements.txt │ ├── interface.py │ ├── faiss_index.py │ ├── README.md │ ├── main.py │ └── learning_path_index_contextual_search.ipynb ├── chainlit.md ├── data ├── .gitkeep ├── utils │ ├── requirements.txt │ ├── get-kaggle-dataset-meta-data.py │ └── README.md ├── dataset-metadata.json └── Courses_and_Learning_Material.csv ├── docs └── .gitkeep ├── .github ├── CONTRIBUTING.md ├── workflows │ └── .gitkeep └── CODEOWNERS ├── requirements ├── llm-poc-variant-01.txt ├── base.txt ├── llm-poc-variant-02.txt └── scraper.txt 
├── pyproject.toml ├── .pre-commit-config.yaml ├── LICENSE ├── getting-started.md ├── .gitignore └── README.md /app/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chainlit.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/workflows/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @neomatrix369 -------------------------------------------------------------------------------- /app/llm-gemma-variant/readme.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/llm-gemma-variant/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/llm-gemma-variant/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /data/utils/requirements.txt: -------------------------------------------------------------------------------- 1 | kaggle==1.5.16 -------------------------------------------------------------------------------- /app/course-scraper/src/scrapers/kaggle_learn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/course-scraper/src/scrapers/google_cloud_skill_boost/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/course-scraper/src/scrapers/google_cloud_skill_boost/scrape_focus.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform* 2 | terraform.* -------------------------------------------------------------------------------- /app/llm-gemma-variant/makefile: -------------------------------------------------------------------------------- 1 | run-gemma: 2 | poetry run python src/backend.py -------------------------------------------------------------------------------- /app/llm-poc-variant-01/.gitignore: -------------------------------------------------------------------------------- 1 | source_documents 2 | vector_db 3 | .python-version -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform* 2 | terraform.* 3 | .service_account_credentials.json 4 | *.plan 5 | 
-------------------------------------------------------------------------------- /app/course-scraper/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neomatrix369/learning-path-index/HEAD/app/course-scraper/requirements.txt -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/project.tfvars: -------------------------------------------------------------------------------- 1 | project_id = "kagglex-llm-demo" 2 | region = "europe-west1" 3 | zone = "europe-west1-b" 4 | -------------------------------------------------------------------------------- /requirements/llm-poc-variant-01.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.261 2 | chromadb==0.3.26 3 | joblib 4 | tqdm==4.65.0 5 | sentence_transformers==2.2.2 6 | -------------------------------------------------------------------------------- /app/Jorge_Rocha_campos_ML - Google Docs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neomatrix369/learning-path-index/HEAD/app/Jorge_Rocha_campos_ML - Google Docs.pdf -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/chainlit-app-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neomatrix369/learning-path-index/HEAD/app/llm-poc-variant-01/deploy/gcp/chainlit-app-demo.gif -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/keypair.tf: -------------------------------------------------------------------------------- 1 | resource "aws_key_pair" "lpi-key" { 2 | key_name = "lpi-key" 3 | public_key = file("~/.ssh/lpi-key.pub") 4 | tags = { 5 | Name = "lpi-key" 6 | } 7 | } 
-------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | cfgv==3.4.0 2 | distlib==0.3.8 3 | filelock==3.16.1 4 | identify==2.6.1 5 | nodeenv==1.9.1 6 | platformdirs==4.3.6 7 | pre_commit==4.0.0 8 | PyYAML==6.0.2 9 | virtualenv==20.26.6 10 | -------------------------------------------------------------------------------- /app/llm-poc-variant-02/.env_template: -------------------------------------------------------------------------------- 1 | # This file won't become part of the git history as long as it exists in 2 | # the .gitignore file, and it should stay like that 3 | OPENAI_API_KEY= 4 | PINECONE_API_KEY= -------------------------------------------------------------------------------- /app/llm-poc-variant-02/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.216 2 | streamlit==1.27.2 3 | tqdm==4.65.0 4 | # Pre-requisites: [sudo] apt install libopenblas-base libomp-dev 5 | # See https://github.com/onfido/faiss_prebuilt 6 | faiss-cpu==1.7.4 7 | faiss-gpu==1.7.2 -------------------------------------------------------------------------------- /requirements/llm-poc-variant-02.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.216 2 | streamlit==1.27.2 3 | tqdm==4.65.0 4 | # Pre-requisites: [sudo] apt install libopenblas-base libomp-dev 5 | # See https://github.com/onfido/faiss_prebuilt 6 | faiss-cpu==1.7.4 7 | faiss-gpu==1.7.2 8 | -------------------------------------------------------------------------------- /requirements/scraper.txt: -------------------------------------------------------------------------------- 1 | annotated-types==0.5.0 2 | certifi==2023.7.22 3 | charset-normalizer==3.2.0 4 | idna==3.4 5 | lxml==4.9.3 6 | pydantic==1.9.2 7 | pydantic_core==2.10.1 8 | requests==2.31.0 9 | 
typing_extensions==4.8.0 10 | urllib3==2.0.5 11 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform="linux/amd64" python:3.10-bookworm 2 | 3 | COPY . . 4 | 5 | ARG REQUESTS_CA_BUNDLE 6 | ENV REQUESTS_CA_BUNDLE="${REQUESTS_CA_BUNDLE:-}" 7 | 8 | RUN pip install -r requirements.txt 9 | 10 | ENTRYPOINT ["/bin/bash"] -------------------------------------------------------------------------------- /app/course-scraper/src/config.py: -------------------------------------------------------------------------------- 1 | class CONFIG: 2 | # XXX: Modify as needed 3 | DATA_PATH = r'data' 4 | 5 | GCSB_JOURNEY_URL = 'https://www.cloudskillsboost.google/journeys/17' 6 | 7 | # CHROME_USER_DATA_DIR = r"C:\Users\user\AppData\Local\Google\Chrome\User Data" 8 | # CHROME_USER = "Default" 9 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/ollama.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Ollama Service 3 | After=network-online.target 4 | 5 | [Service] 6 | ExecStart=/usr/bin/ollama serve 7 | User=ollama 8 | Group=ollama 9 | Restart=always 10 | RestartSec=3 11 | Environment="PATH=$PATH" 12 | 13 | [Install] 14 | WantedBy=default.target 15 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | ### AWS: https://registry.terraform.io/providers/hashicorp/aws/latest 4 | aws = { 5 | source = "hashicorp/aws" 6 | version = "~> 5.26.0" 7 | } 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = "eu-central-1" 13 | profile = "default" 14 | } 
-------------------------------------------------------------------------------- /app/llm-poc-variant-01/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from chromadb.config import Settings 4 | 5 | # Define the folder for storing database 6 | PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY', 'vector_db') 7 | 8 | # Define the Chroma settings 9 | CHROMA_SETTINGS = Settings( 10 | chroma_db_impl='duckdb+parquet', 11 | persist_directory=PERSIST_DIRECTORY, 12 | anonymized_telemetry=False, 13 | ) 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 88 3 | select = [ 4 | "C", # mccabe rules 5 | "F", # pyflakes rules 6 | "E", # pycodestyle error rules 7 | "W", # pycodestyle warning rules 8 | "B", # flake8-bugbear rules 9 | "I", # isort rules 10 | ] 11 | ignore = [ 12 | "C901", # max-complexity-10 13 | "E501", # line-too-long 14 | ] 15 | 16 | [tool.ruff.format] 17 | indent-style = "space" 18 | quote-style = "single" 19 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-toml 6 | - id: check-yaml 7 | - id: end-of-file-fixer 8 | - id: trailing-whitespace 9 | 10 | - repo: https://github.com/astral-sh/ruff-pre-commit 11 | rev: v0.1.5 12 | hooks: 13 | - id: ruff 14 | args: [--fix, --exit-non-zero-on-fix, --show-fixes] 15 | - id: ruff-format 16 | -------------------------------------------------------------------------------- /app/course-scraper/src/gsheet.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = 
logging.getLogger(__name__) 4 | 5 | 6 | def connect_to_gsheet(sheet_id: str): 7 | # Connect to a Google Sheet 8 | ... 9 | 10 | 11 | class GSheetWriter: 12 | """ 13 | TODO: Emulate the behaviour of csv.writer and csv.DictWriter, 14 | but instead write to a Google Sheet 15 | """ 16 | 17 | ... 18 | 19 | 20 | class GSheetReader: 21 | """ 22 | TODO: Emulate the behaviour of csv.reader and csv.DictReader, 23 | but instead write to a Google Sheet 24 | """ 25 | 26 | ... 27 | 28 | 29 | # Write a new row 30 | -------------------------------------------------------------------------------- /app/llm-gemma-variant/src/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | from llama_index.core import Settings 4 | from loguru import logger 5 | from llama_index.llms.ollama import Ollama 6 | from model import Gemma 7 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 8 | from vector_db import VectorDB 9 | from pathlib import Path 10 | 11 | # Initialize the model 12 | gemma2_2b = Ollama(model="gemma2:2b", request_timeout=60.0) 13 | logger.debug(gemma2_2b.complete("Hello, how are you?")) 14 | llm_model = Gemma(gemma2_2b, 2000) 15 | Settings.llm = llm_model 16 | 17 | print(llm_model.complete(prompt = "Hello, how are you?")) -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/security_groups.tf: -------------------------------------------------------------------------------- 1 | resource "aws_security_group" "lpi-sg" { 2 | name = "lpi-sg" 3 | description = "LPI Security Group" 4 | 5 | ingress { 6 | description = "SSH" 7 | from_port = 22 8 | to_port = 22 9 | protocol = "tcp" 10 | cidr_blocks = ["0.0.0.0/0"] 11 | ipv6_cidr_blocks = ["::/0"] 12 | } 13 | 14 | egress { 15 | from_port = 0 16 | to_port = 0 17 | protocol = "-1" 18 | cidr_blocks = ["0.0.0.0/0"] 19 | ipv6_cidr_blocks = ["::/0"] 20 | } 21 | 22 | tags = { 23 | 
import streamlit as st


def app():
    """Render the search page and hand back whatever query the user typed."""
    # Page header and tagline
    st.title("KaggleX Learning Path Index Search")
    st.write("Embark your Learning Path Journey with right search !!")

    # Single free-text input; Streamlit returns the current widget value
    query = st.text_input("Enter your course query here")

    # Uncomment to echo the captured query back onto the page:
    # st.write(f"The stored variable is: {query}")

    return query


# Run your Streamlit app
# if __name__ == "__main__":
#     var = app()
#     print(var)
terraform { 2 | backend "gcs" { 3 | bucket= "llm-project-sbx-tf-state" 4 | prefix = "static.tfstate.d" 5 | } 6 | 7 | required_providers { 8 | ### GCP: https://registry.terraform.io/providers/hashicorp/google/latest 9 | google = { 10 | source = "hashicorp/google" 11 | version = "~> 4.0" 12 | } 13 | } 14 | } 15 | 16 | variable "project_id" { 17 | type = string 18 | description = "The ID of the GCP project" 19 | } 20 | 21 | variable "region" { 22 | type = string 23 | description = "The region of the GCP project" 24 | } 25 | 26 | variable "zone" { 27 | type = string 28 | description = "The zone of the GCP project" 29 | } 30 | 31 | provider "google" { 32 | project = var.project_id 33 | region = var.region 34 | } 35 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/outputs.tf: -------------------------------------------------------------------------------- 1 | output "self_link" { 2 | description = "The self link of the instance" 3 | value = try(google_compute_instance.lpi-cpu-vm.self_link, "") 4 | } 5 | 6 | output "network_interface_0_access_config_0_nat_ip" { 7 | description = "The external IP address assigned to the instance" 8 | value = try(google_compute_instance.lpi-cpu-vm.network_interface[0].access_config[0].nat_ip, "") 9 | } 10 | 11 | output "network_interface_0_network_ip" { 12 | description = "The internal IP address assigned to the instance" 13 | value = try(google_compute_instance.lpi-cpu-vm.network_interface[0].network_ip, "") 14 | } 15 | 16 | output "tags" { 17 | description = "A map of tags assigned to the resource" 18 | value = try(google_compute_instance.lpi-cpu-vm.tags, {}) 19 | } 20 | -------------------------------------------------------------------------------- /app/llm-gemma-variant/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "llm-gemma-variant" 3 | version = "0.1.0" 4 | description = "" 5 | authors 
import string

# Characters that are safe to keep when sanitising a string for use in
# filenames: ASCII letters, digits, dot and hyphen.
SAFECHARS = string.ascii_lowercase + string.ascii_uppercase + string.digits + ".-"


def get_safestring(string: str):
    """Return *string* with every character not in SAFECHARS stripped out.

    NOTE: the parameter name shadows the stdlib ``string`` module; it is kept
    for backward compatibility with keyword callers (the module-level
    SAFECHARS constant is computed before any call, so behaviour is correct).
    """
    return "".join([c for c in string if c in SAFECHARS])


def find_element_by_xpath(dom, xpath):
    """Return the first element in *dom* matching *xpath*.

    Raises IndexError (as before) when nothing matches, but now with a
    message naming the failing XPath instead of a bare ``list index out of
    range``.
    """
    matches = dom.xpath(xpath)
    if not matches:
        raise IndexError(f"no element matched XPath {xpath!r}")
    return matches[0]


def find_elements_by_xpath(dom, xpath):
    """Return all elements in *dom* matching *xpath* (possibly empty)."""
    return dom.xpath(xpath)


def login_selenium_driver_to_gcb(driver: "WebDriver"):
    """Log a Selenium driver into Google Cloud Skill Boost.

    Uses the credentials and URL from CONFIG; the original referenced CONFIG
    without ever importing it, which raised NameError on every call.
    """
    from config import CONFIG  # local import fixes the missing-name bug
    from scrapers.google_cloud_skill_boost import pages

    driver.get(CONFIG.GCB_LOGIN_URL)
    print(driver.title)
    driver.find_element("xpath", pages.GCSBSignInPage.user_email).send_keys(
        CONFIG.GCB_EMAIL
    )
    driver.find_element("xpath", pages.GCSBSignInPage.user_password).send_keys(
        CONFIG.GCB_PASSWORD
    )
    driver.find_element("xpath", pages.GCSBSignInPage.sign_in_button).click()
# Original code: https://lindevs.com/get-dataset-metadata-from-kaggle-using-api-and-python/
"""Fetch the metadata of the Learning Path Index Kaggle dataset and save it
as pretty-printed JSON into the parent ``data/`` folder."""

import os
import json
from pprint import pprint

from kaggle.api.kaggle_api_extended import KaggleApi

owner = 'neomatrix369'
dataset_name = 'learning-path-index-dataset'

api = KaggleApi()
# Reads credentials from ~/.kaggle/kaggle.json or KAGGLE_* env vars
api.authenticate()

print(f"\nFetching the metadata of {owner}/{dataset_name}")
metadata = api.metadata_get(owner, dataset_name)

print(f"\nPrinting the metadata of {owner}/{dataset_name}")
pprint(metadata)

# Resolve the output path relative to this script (data/utils/..) rather than
# the current working directory, so the script works from any launch location.
metadata_filename = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "..", "dataset-metadata.json"
)

print(f"\nSaving the metadata to {metadata_filename}")
with open(metadata_filename, "w", encoding="utf-8") as metadata_file:
    # indent=2 formats the JSON when saving it
    metadata_file.write(json.dumps(metadata, indent=2))
list[CourseSubmodule] # __root__ == 🌟 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 mani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
"""Demo: build a FAISS index over the Learning Pathway Index CSV, persist it,
reload it, and answer one retrieval-QA query with OpenAI."""

import os

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI


def _resolve_data_path(base_dir: str) -> str:
    """Return the path to the course CSV under *base_dir*.

    The original concatenated hard-coded Windows separators
    (``"\\final_project\\..."``), which breaks on Linux/macOS;
    os.path.join builds the correct path on every platform.
    """
    return os.path.join(base_dir, "final_project", "Learning_Pathway_Index.csv")


def faiss_index():
    """Index the CSV, save/reload the vector store and run a sample query.

    Requires OPENAI_API_KEY in the environment (used by OpenAIEmbeddings
    and the OpenAI LLM).
    """
    data_path = _resolve_data_path(os.getcwd())
    loader = TextLoader(data_path)
    documents = loader.load()
    text_splitter = CharacterTextSplitter(
        chunk_size=1000, chunk_overlap=30, separator="\n"
    )
    docs = text_splitter.split_documents(documents=documents)

    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local("faiss_learning_path_index")

    # Round-trip through disk to demonstrate persistence before querying
    new_vectorstore = FAISS.load_local("faiss_learning_path_index", embeddings)
    qa = RetrievalQA.from_chain_type(
        llm=OpenAI(), chain_type="stuff", retriever=new_vectorstore.as_retriever()
    )
    res = qa.run("Give me Machine Learning Course with 10 or 20 min duration.")
    print(res)


if __name__ == "__main__":
    faiss_index()
${FULL_DOCKER_TAG_NAME} || true 15 | } 16 | 17 | WORKDIR="/home/" 18 | LOCAL_MODEL_FOLDER="$(pwd)/../" 19 | MODEL_VOLUME_SHARED="--volume ${LOCAL_MODEL_FOLDER}:${WORKDIR}" 20 | OLLAMA_VOLUME_SHARED="--volume $(which ollama):/usr/bin/ollama" 21 | HF_CACHE_SHARED="--volume ${LOCAL_MODEL_FOLDER}/.cache:/root/.cache" 22 | 23 | set -x 24 | 25 | # pullImage 26 | time docker run --rm -it --network="host" \ 27 | --platform="linux/amd64" \ 28 | --network="host" \ 29 | --add-host=host.docker.internal:host-gateway \ 30 | --workdir "${WORKDIR}" \ 31 | --env OLLAMA_HOST="http://host.docker.internal:11434" \ 32 | ${HF_CACHE_SHARED} \ 33 | ${MODEL_VOLUME_SHARED} \ 34 | ${OLLAMA_VOLUME_SHARED} \ 35 | "${FULL_DOCKER_TAG_NAME}" 36 | set +x 37 | 38 | echo "* Finished running docker image ${FULL_DOCKER_TAG_NAME}" -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/outputs.tf: -------------------------------------------------------------------------------- 1 | output "arn" { 2 | description = "The ARN of the instance" 3 | value = try(aws_instance.lpi-cpu-vm.arn, "") 4 | } 5 | 6 | output "public_dns" { 7 | description = "This public DNS name assigned to the instance" 8 | value = try(aws_instance.lpi-cpu-vm.public_dns, "") 9 | } 10 | 11 | output "public_ip" { 12 | description = "This public IP address assigned to the instance" 13 | value = try(aws_instance.lpi-cpu-vm.public_ip, "") 14 | } 15 | 16 | output "private_dns" { 17 | description = "This private DNS name assigned to the instance" 18 | value = try(aws_instance.lpi-cpu-vm.private_dns, "") 19 | } 20 | 21 | output "private_ip" { 22 | description = "This private IP address assigned to the instance" 23 | value = try(aws_instance.lpi-cpu-vm.private_ip, "") 24 | } 25 | 26 | 27 | # Outputs the id of the subnet you created in the module 28 | #output "subnet_id" { 29 | # value = try(aws_subnet.this.id, "") 30 | #} 31 | 32 | # Outputs the value of the 33 | # 
/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2 parameter. 34 | #output "ami_id" { 35 | # value = try(data.aws_ssm_parameter.this.value, "") 36 | #} 37 | 38 | output "tags_all" { 39 | description = "A map of tags assigned to the resource" 40 | value = try(aws_instance.lpi-cpu-vm.tags_all, "") 41 | } 42 | -------------------------------------------------------------------------------- /app/course-scraper/src/scrapers/google_cloud_skill_boost/pages.py: -------------------------------------------------------------------------------- 1 | """ 2 | Selectors for different HTML pages 3 | Plural attributes imply a list of elements are returned by the xpath, 4 | rather than a single element 5 | """ 6 | 7 | 8 | # NOTE: GCSB = Google Cloud Skill Boost 9 | class GCSBSignInPage: 10 | user_email = '//input[@id="user_email"]' 11 | user_password = '//input[@id="user_password"]' 12 | sign_in_button = '//button[@data-analytics-action="clicked_sign_in"]' 13 | 14 | 15 | class GCSBLearningJourneyPage: 16 | """ 17 | E.g https://www.cloudskillsboost.google/journeys/183) 18 | """ 19 | 20 | journeys = "//div[@class='activity-card']" 21 | journey_title = ".//h2[2]/text()" 22 | journey_details = ".//div[@class='activity-details']//div[contains(@class, 'ql-subhead-1')]/text()" 23 | journey_description = ".//p/text()" 24 | journey_link = ".//ql-button[contains(text(), 'Learn more')]/@href" 25 | 26 | 27 | class GCSBCourseTemplatePage: 28 | """ 29 | Skill Boost Course page 30 | E.g https://www.cloudskillsboost.google/course_templates/541 31 | """ 32 | 33 | course_title = "//h1[@class='ql-headline-1']" 34 | prework = "(//div[div/text() = 'Prerequisites'])/following-sibling::div/text()" 35 | 36 | 37 | class GCSBFocusPage: 38 | """ 39 | E.g https://www.cloudskillsboost.google/focuses/71938?parent=catalog 40 | """ 41 | 42 | ... 
43 | -------------------------------------------------------------------------------- /app/course-scraper/src/scrapers/kaggle_learn/README.md: -------------------------------------------------------------------------------- 1 | # Kaggle Learn Course Scraper 2 | 3 | This folder contains scripts for scraping courses from the Kaggle Learn website using the Kaggle Learn API. The scripts retrieve course information in JSON format without the need for HTML parsing. 4 | 5 | ## Scripts 6 | 7 | 1. **`scrape_all_courses.py`**: This script makes use of the Kaggle Learn API to scrape all available courses from the platform in a single API request. It's the recommended script to use when scraping a comprehensive list of courses. 8 | 9 | 2. **`scrape_course.py`**: This script is provided for illustrative purposes. It demonstrates how to scrape course information using the Kaggle Learn API on a per-course basis. 10 | 11 | ## Getting Started 12 | 13 | To get started with course scraping, you can choose between the two scripts mentioned above based on your requirements. 14 | 15 | ### Prerequisites 16 | 17 | Make sure you have Python installed on your system. 18 | 19 | ### Usage 20 | 21 | 1. Clone this repository 22 | 23 | 2. Navigate to the repository folder: 24 | 25 | ```bash 26 | cd course-scraper/src 27 | ``` 28 | 29 | 3. 
"""
Scrape Courses from Kaggle Learn
This script makes use of the internal Kaggle Learn API to retrieve course information
Without parsing any HTML
"""

from pathlib import Path

import requests
from config import CONFIG
from scrapers.kaggle_learn.models import KaggleCourse

# Internal (undocumented) Kaggle API endpoint that returns one course
# ("track") as JSON.
KAGGLE_COURSE_API_URL = (
    "https://www.kaggle.com/api/i/education.EducationService/GetTrack"
)

KAGGLE_DATA_PATH = Path(CONFIG.DATA_PATH, "KaggleLearnCourses")
# Bug fix: ensure the output folder exists before writing, matching
# scrape_all_courses.py; without this, open() below fails with
# FileNotFoundError on a fresh checkout.
KAGGLE_DATA_PATH.mkdir(exist_ok=True, parents=True)


def get_course_details(url: str) -> dict:
    """
    Get details of a Kaggle Learn course
    e.g https://www.kaggle.com/learn/feature-engineering

    :param url: public course page URL; the trailing path segment is the
        track slug posted to the API.
    :return: parsed JSON payload describing the course.
    """
    session = requests.Session()
    # Make a preparatory request to get relevant cookies
    session.get(url)
    # Kaggle requires the XSRF cookie to be echoed back as a request header.
    xsrf_token = session.cookies.get("XSRF-TOKEN")
    track_slug = url.split("/")[-1]
    r = session.post(
        KAGGLE_COURSE_API_URL,
        headers={"X-Xsrf-Token": xsrf_token, "Content-Type": "application/json"},
        json={"trackSlug": track_slug},
    )

    return r.json()


course = KaggleCourse.parse_obj(
    get_course_details("https://www.kaggle.com/learn/feature-engineering")
)

with open(
    KAGGLE_DATA_PATH.joinpath("feature-engineering-course.csv"), "w", encoding="utf-8"
) as f:
    course.write_course_summary_to_file(f)
from typing import Any

from llama_index.core.callbacks import CallbackManager
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, PromptTemplate


# This creates a custom llm for more control over our model
class Gemma(CustomLLM):
    """Thin llama-index CustomLLM wrapper around an injected model client."""

    # Maximum number of tokens to generate per completion.
    num_output: int = 8192
    model_name: str = "Gemma"
    # The wrapped client (e.g. an Ollama instance); typed Any so pydantic
    # does not attempt to validate it.
    model: Any = None

    def __init__(self, model: Any, num_output: int) -> None:
        super(Gemma, self).__init__()
        self.model = model
        self.num_output = num_output

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        # NOTE(review): assumes the wrapped client's complete() accepts a
        # max_length keyword — confirm for the client actually injected
        # (backend.py passes an Ollama instance).
        return self.model.complete(prompt, max_length=self.num_output)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        # Accumulate the full text while yielding one response per token;
        # `delta` carries only the newly generated token.
        response = ""
        for token in self.model.generate(prompt, max_length=self.num_output):
            response += token
            yield CompletionResponse(text=response, delta=token)
Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset). 4 | 5 | ## Pre-requisites 6 | 7 | - Python 3.10 or higher 8 | - Docker (to run inside docker containers) 9 | - Shell-scripting (basic skills) 10 | - Kaggle 11 | 12 | **Steps** 13 | 14 | - Setup your `.bashrc` or `.zshrc` or Windows environment with the below environment variables: 15 | 16 | ```bash 17 | export KAGGLE_USERNAME="[your kaggle username]" 18 | export KAGGLE_KEY="[your kaggle API key]" 19 | ``` 20 | 21 | - See [Kaggle API Docs](https://www.kaggle.com/docs/api) the lastest docs, but for specific queries like 22 | [How to Obtain a Kaggle API Key](https://christianjmills.com/posts/kaggle-obtain-api-key-tutorial/#generate-your-kaggle-api-key), [Christian Mill's Kaggle Docs](https://christianjmills.com/posts/kaggle-obtain-api-key-tutorial/) are also useful. 23 | 24 | - Install dependencies by running: 25 | 26 | ```bash 27 | pip install requirements.txt 28 | ``` 29 | 30 | ## Usage 31 | 32 | ```bash 33 | cd [into this folder] 34 | python get-kaggle-dataset-meta-data.py 35 | ``` 36 | 37 | This creates the metadata json file in the parent folder by the name `dataset-metadata.json`. 
import os
from dotenv import load_dotenv
from llama_index.core import Settings
from loguru import logger
from llama_index.llms.ollama import Ollama
from model import Gemma
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from vector_db import VectorDB
from pathlib import Path


def main() -> None:
    """Run a naive-RAG demo query over the Learning Pathway Index CSV.

    Wires together the Gemma LLM (served by Ollama), a HuggingFace
    embedding model and the Weaviate-backed vector index, executes one
    example query and logs the response.
    """
    # Load and verify API key
    load_dotenv('credentials.env')

    # Initialize the model
    gemma2_2b = Ollama(model="gemma2:2b")
    # 2000 caps the output token budget (see model.Gemma.num_output).
    llm_model = Gemma(gemma2_2b, 2000)
    Settings.llm = llm_model

    # Initialize embedding model
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

    # Getting data path
    # NOTE(review): assumes the process is launched from the directory that
    # contains LPI_folder/ — confirm, or consider anchoring on __file__.
    current_directory = os.getcwd()
    data_path = os.path.join(current_directory, r"LPI_folder/Learning_Pathway_Index.csv")

    # Initialize the VectorDB class
    weaviate_vector_db = VectorDB(
        data_path=data_path,
        index_name="Learning_path_index"
    )

    # Create the vector database
    index = weaviate_vector_db.vector_db_creation()

    # Initialize RAG
    naive_rag_query_engine = index.as_query_engine()

    # Run your naive RAG query
    response = naive_rag_query_engine.query("What courses should I take if i want to learn about finetuning?")

    logger.info(response.response)

    # Disconnect from the Weaviate vector database
    weaviate_vector_db.disconnect()

if __name__ == "__main__":
    main()
"""
Scrape Courses from Kaggle Learn
This script makes use of the internal Kaggle Learn API to retrieve course information
Without parsing any HTML
"""

from pathlib import Path

import requests
from config import CONFIG
from pydantic import BaseModel
from scrapers.kaggle_learn.models import (
    KaggleCourse,
)
from utils import get_safestring

# Internal (undocumented) Kaggle API endpoint that returns every course
# ("track") in a single JSON payload.
KAGGLE_COURSE_API_URL = (
    "https://www.kaggle.com/api/i/education.EducationService/GetTracks"
)

KAGGLE_DATA_PATH = Path(CONFIG.DATA_PATH, "KaggleLearnCourses")
KAGGLE_DATA_PATH.mkdir(exist_ok=True, parents=True)


def get_page_details(url: str) -> dict:
    """
    Get all courses and their details from Kaggle Learn Homepage https://www.kaggle.com/learn/

    :param url: any Kaggle page URL; it is fetched only to obtain session
        cookies (including the XSRF token) required by the API call.
    :return: parsed JSON payload with a top-level 'tracks' list.
    """
    session = requests.Session()
    # Make a preparatory request to get relevant cookies
    session.get(url)
    # Kaggle requires the XSRF cookie to be echoed back as a request header.
    xsrf_token = session.cookies.get("XSRF-TOKEN")
    r = session.post(
        KAGGLE_COURSE_API_URL,
        headers={"X-Xsrf-Token": xsrf_token, "Content-Type": "application/json"},
        json={},
    )

    return r.json()


class AllKaggleCourses(BaseModel):
    # Wrapper model matching the API's top-level {'tracks': [...]} shape.
    tracks: list[KaggleCourse]


page = AllKaggleCourses.parse_obj(
    get_page_details("https://www.kaggle.com/learn/feature-engineering")
)

# Write one summary CSV per course, named with a filesystem-safe slug.
for course in page.tracks:
    file_name = get_safestring(course.name)
    with open(
        KAGGLE_DATA_PATH.joinpath(f"{file_name}.csv"), "w", encoding="utf-8"
    ) as f:
        course.write_course_summary_to_file(f)
# EC2 instance that hosts the LPI LLM proof-of-concept (CPU-only).
resource "aws_instance" "lpi-cpu-vm" {
  ami = "ami-065deacbcaac64cf2" //Ubuntu AMI
  ### https://aws.amazon.com/ec2/instance-types/
  ###### t2.xlarge = CPU based
  instance_type = "t2.2xlarge"
  ebs_block_device {
    device_name = "/dev/sda1"
    # Root volume size in GiB.
    volume_size = 20
  }

  tags = {
    Name = "LPI Instance (CPU/vm)"
  }

  # Key pair and security group are declared in sibling .tf files.
  key_name = aws_key_pair.lpi-key.key_name
  security_groups = ["lpi-sg"]

  connection {
    type = "ssh"
    ### Important to set this to the correct user, as for AMI Ubuntu/Linux boxes
    ### the default name is 'ubuntu', and NOT 'ec2-user'
    user = "ubuntu"
    private_key = var.ssh_private_key
    password = ""
    host = self.public_ip
  }

  # Bootstrap: install Docker and Ollama, pull the model, then fetch the
  # Learning Path Index dataset for the app.
  provisioner "remote-exec" {
    inline = [
      "sudo apt-get update -y",
      "sudo apt install -y ca-certificates curl gnupg lsb-release",
      "sudo apt-get update -y",
      "curl -fsSL https://get.docker.com -o get-docker.sh",
      "sudo sh get-docker.sh",
      "sudo groupadd -f docker",
      "sudo usermod -aG docker $USER",
      "docker -v || true",
      "curl https://ollama.ai/install.sh | sh",
      # NOTE(review): 'ollama serve' runs in the foreground and would block
      # this provisioner indefinitely — confirm whether the installer's
      # systemd service already serves and this line can be removed or
      # backgrounded.
      "ollama serve",
      "ollama pull llama2-uncensored",
      "echo; ollama list; echo",
      "git clone https://github.com/neomatrix369/learning-path-index",
      "cd learning-path-index/app/llm-poc-variant-01/",
      "mkdir -p source_documents",
      "curl https://raw.githubusercontent.com/neomatrix369/learning-path-index/main/data/Learning_Pathway_Index.csv -o 'source_documents/Learning Pathway Index.csv'"
    ]
  }
}
**Setup Your Virtual Environment** 4 | - Create a new virtual environment: 5 | 6 | ```bash 7 | python -m venv venv 8 | source venv/bin/activate 9 | 10 | #Windows 11 | venv/Scripts/activate 12 | ``` 13 | 14 | 2. **Install Dependencies** 15 | 16 | - Change to the appropriate directory 17 | 18 | ```bash 19 | cd C:/{path}/learning-path-index/app/course-scraper 20 | ``` 21 | 22 | 23 | - Run the following command to install the required dependencies: 24 | 25 | ```bash 26 | pip install -r requirements.txt 27 | ``` 28 | 29 | 3. **Run the Scripts** 30 | 31 | - **Scrape a Journey** (Run this first for each journey) 32 | - Example: 33 | To scrape the ML Engineer Path (https://www.cloudskillsboost.google/journeys/183) 34 | modify the config variables in `scrape_journey.py` and run 35 | ```bash 36 | python -m scrapers.google_cloud_skill_boost.scrape_journey 37 | ``` 38 | 39 | - **Scrape a Course Template** 40 | - Example: 41 | To scrape the details of all the courses in the ML Engineer Path (Details of Learning Paths are termed course templates e.g https://www.cloudskillsboost.google/course_templates/541), 42 | ```bash 43 | python -m scrapers.scrapers.google_cloud_skill_boost.scrape_course_template 44 | ``` 45 | 46 | - **TODO: Scrape a Lab/Focus** 47 | - Example: 48 | To scrape the details of a lab (An example lab is https://www.cloudskillsboost.google/focuses/71938?parent=catalog) 49 | ```bash 50 | python -m scrapers.scrapers.google_cloud_skill_boost.scrape_focus 51 | ``` 52 | 53 | ## Configuration 54 | You can modify most of the scraping behavior and parameters by editing the `config.py` file. 
55 | -------------------------------------------------------------------------------- /app/llm-poc-variant-02/README.md: -------------------------------------------------------------------------------- 1 | # kagglex-final-project 2 | 3 | A prototype written in Python to illustrate/demonstrate querying the Learning Path Index Dataset (see [Kaggle Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset) and [GitHub repo](https://github.com/neomatrix369/learning-path-index)), with the help of the OpenAI GPT technology (InstructHPT model and embeddings model), [Langchain](https://python.langchain.com/) and using [Facebook's FAISS library](https://faiss.ai/). 4 | 5 | 6 | ![image](https://github.com/mbhoge/kagglex-final-project/assets/988040/5396aee3-cf0f-43b6-9b44-aaf779ed803a) 7 | 8 | 9 | The end-to-end process can be learnt by going through the code base as well as by observing the console logs when using both the Streamlit and the CLI versions. 10 | 11 | ## Pre-requisites 12 | 13 | - Python 3.8.x or above 14 | - OpenAI API Key (see [How to get an OpenAI API Key](https://www.howtogeek.com/885918/how-to-get-an-openai-api-key/) -- note it's may not be FREE anymore) 15 | - Install dependencies from `requirements.txt` 16 | - Basic Command-line experience 17 | - Basic git and GitHub experience 18 | 19 | ## Install and run 20 | 21 | Copy the `.env_template` to `.env` in the current folder and then add your OpenAI API Key to `.env`. 
#!/bin/bash
# Build the python-3.10 docker image for llm-poc-variant-01 and push it to
# Docker Hub under ${DOCKER_USER_NAME}. Requires a local docker daemon and
# (for the push step) interactive Docker Hub credentials.

set -e
set -u
set -o pipefail

DOCKER_USER_NAME="${DOCKER_USER_NAME:-neomatrix369}"
FULL_DOCKER_TAG_NAME="python-3.10-docker-env"

# Remove exited containers and dangling images from the local registry.
# The trailing '|| true' keeps the '[ ... ] && ...' chains from tripping
# 'set -e' when there is nothing to clean up.
cleanup() {
    containersToRemove=$(docker ps --quiet --filter "status=exited")
    [ ! -z "${containersToRemove}" ] && \
        echo "Remove any stopped container from the local registry" && \
        docker rm ${containersToRemove} || true

    imagesToRemove=$(docker images --quiet --filter "dangling=true")
    [ ! -z "${imagesToRemove}" ] && \
        echo "Remove any dangling images from the local registry" && \
        docker rmi -f ${imagesToRemove} || true
}

# Log in to Docker Hub and push the freshly built image.
pushImageToHub() {
    echo "Pushing image ${FULL_DOCKER_TAG_NAME} to Docker Hub"; echo ""

    if ! docker login --username=${DOCKER_USER_NAME}; then
        echo "Failed to login to Docker Hub"
        exit 1
    fi
    pushImage ${FULL_DOCKER_TAG_NAME}
}

# Echo the id of the newest local image matching $1 (empty if none).
findImage() {
    IMAGE_NAME="${1}"
    echo $(docker images ${IMAGE_NAME} -q | head -n1 || true)
}

# Tag the local image as ${DOCKER_USER_NAME}/${IMAGE_NAME} and push it.
pushImage() {
    IMAGE_NAME="${1}"
    FULL_DOCKER_TAG_NAME="${DOCKER_USER_NAME}/${IMAGE_NAME}"

    IMAGE_FOUND="$(findImage ${IMAGE_NAME})"
    IS_FOUND="found"
    if [[ -z "${IMAGE_FOUND}" ]]; then
        IS_FOUND="not found"
    fi
    echo "Docker image '${DOCKER_USER_NAME}/${IMAGE_NAME}' is ${IS_FOUND} in the local repository"

    # NOTE(review): if IMAGE_FOUND is empty, 'docker tag' is invoked with a
    # missing argument and the script aborts under 'set -e' — confirm
    # whether an explicit "not found" early exit is intended here.
    docker tag ${IMAGE_FOUND} ${FULL_DOCKER_TAG_NAME}
    docker push ${FULL_DOCKER_TAG_NAME}
}


echo "Building image ${FULL_DOCKER_TAG_NAME}"; echo ""

WORKDIR="/home/"
cleanup

# The Dockerfile expects requirements.txt in the build context; copy it in
# for the build and remove it afterwards.
cp ../requirements.txt .
set -x
# Optional corporate CA bundle (first *.pem in this folder, if any).
REQUESTS_CA_BUNDLE="$(ls *.pem || true)"
time docker build \
    --build-arg WORKDIR=${WORKDIR} \
    --build-arg REQUESTS_CA_BUNDLE=${REQUESTS_CA_BUNDLE} \
    -t ${FULL_DOCKER_TAG_NAME} \
    .
set +x
rm -f requirements.txt

echo "* Finished building docker image ${FULL_DOCKER_TAG_NAME}"

pushImageToHub

cleanup
curl https://raw.githubusercontent.com/neomatrix369/learning-path-index/main/data/Learning_Pathway_Index.csv -o 'source_documents/Learning Pathway Index.csv' 58 | 59 | python3 -m venv venv 60 | . venv/bin/activate 61 | python3 -m pip install -r requirements.txt 62 | 63 | chainlit run chainlit_app.py --host 0.0.0.0 --port 8000 64 | EOF 65 | 66 | tags = [ 67 | "lpi-sg", 68 | "http-server", 69 | "https-server" 70 | ] 71 | } 72 | -------------------------------------------------------------------------------- /app/course-scraper/README.md: -------------------------------------------------------------------------------- 1 | # Course Scraper Module 2 | 3 | The Course Scraper Module is a versatile tool designed to fetch course information and duration from various online learning platforms. It simplifies the process of finding the right resource for your learning needs by providing a unified interface for accessing course details from supported platforms. 4 | 5 | ## Supported Platforms 6 | 7 | - [x] Google Developer Courses 8 | - [x] Fast.ai ML Course 9 | - [x] IBM - AI & Ethics Course 10 | - [ ] Google Cloud Skill Boost: Machine Learning Engineer 11 | - [x] Google Cloud Skill Boost: Data Learning Engineer 12 | - [ ] Google Cloud Skill Boost: Data Analyst 13 | - [x] Google Cloud Skill Boost: Generative AI 14 | - [ ] Google Cloud Skill Boost: AD-HOC Courses 15 | - [ ] [Kaggle Learn Courses](./src/scrapers/kaggle_learn) 16 | - [ ] Deeplearning.ai Courses 17 | 18 | ## Getting Started 19 | 20 | To get started with the Course Scraper Module, follow these steps: 21 | 22 | 1. **Clone the Repository:** 23 | 24 | Clone this GitHub repository to your local machine: 25 | 26 | 27 | 2. **Navigate to the Course Scraper Module:** 28 | 29 | Change your current working directory to the course-scraper subfolder within the cloned repository: 30 | 31 | ```bash 32 | cd learning-path-index/course-scraper 33 | ``` 34 | 35 | 3. 
**Install Dependencies:** 36 | 37 | Ensure you have all the required dependencies installed. You can do this using pip: 38 | 39 | ```bash 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | 4. **Run the Scraper:** 44 | 45 | Scrapers specific to each platform can be found in `course-scraper/src/scrapers` folder. 46 | Would you like to scrape courses from *Kaggle Learn*? 47 | Checkout the [Kaggle Learn scraper README.md](). 48 | How about *Google Cloud Skill Boost*? 49 | Checkout the [GCSB scraper README.md](). 50 | 51 | 52 | Generally scrapers can be run by navigating to the `course-scraper/src` folder, and running 53 | ```bash 54 | python -m scrapers.. 55 | ``` 56 | 57 | e.g 58 | 59 | ```bash 60 | python -m scrapers.kaggle_learn.scrape_all_courses 61 | ``` 62 | 63 | 64 | 5. **View the Results:** 65 | 66 | The scraper will provide the course details and duration in a structured format. In the folder determined by `config.py` 67 | 68 | ## Usage 69 | 70 | ### Scraper Configuration 71 | 72 | You can configure the general behaviour of all scrapers by modifying the `config.py` file. This file allows you to specify: 73 | - output location ✅ 74 | - the output format (TODO: 🚧), 75 | 76 | and other settings. 77 | 78 | ## Contributing 79 | 80 | We welcome contributions to enhance and expand the Course Scraper Module. If you'd like to contribute, please follow these guidelines: 81 | 82 | 1. Fork the repository. 83 | 2. Create a new branch for your feature or bug fix. 84 | 3. Make your changes and ensure that the code passes all tests. 85 | 4. Submit a pull request with a clear description of your changes and their purpose. 86 | 87 | 88 | Happy learning and scraping! 
import argparse
import csv
import os
from csv import DictWriter
from pathlib import Path
from urllib.parse import urljoin

import requests
from config import CONFIG
from lxml import etree
from scrapers.google_cloud_skill_boost import pages

COURSE_CODE = 'CLMML11'
GCSB_HOME_URL = 'https://www.cloudskillsboost.google/'
GCSB_LOGIN_URL = 'https://www.cloudskillsboost.google/users/sign_in'

DATA_FOLDER = Path(CONFIG.DATA_PATH, COURSE_CODE)
DATA_FOLDER.mkdir(exist_ok=True, parents=True)


# Open Journey Path
def extract_ml_learning_path(GCSB_JOURNEY_URL) -> list[dict]:
    """Scrape a GCSB learning-journey page into a list of course dicts.

    :param GCSB_JOURNEY_URL: journey URL,
        e.g. https://www.cloudskillsboost.google/journeys/183
    :return: one dict per activity card, with keys
        'title', 'details', 'description' and 'link'.
    """
    r = requests.get(GCSB_JOURNEY_URL)
    dom = etree.fromstring(r.content, etree.HTMLParser())

    data = []
    for journey in dom.xpath(pages.GCSBLearningJourneyPage.journeys):
        details = journey.xpath(pages.GCSBLearningJourneyPage.journey_details)
        details = details[0] if details else 'No details available'

        # Card links are relative; resolve them against the site root.
        link = journey.xpath(pages.GCSBLearningJourneyPage.journey_link)
        link = urljoin(GCSB_HOME_URL, link[0]) if link else 'No link available'

        data.append(
            {
                'title': journey.xpath(pages.GCSBLearningJourneyPage.journey_title)[
                    0
                ].strip()
                if journey.xpath(pages.GCSBLearningJourneyPage.journey_title)
                else 'No title available',
                'details': details.strip(),
                'description': journey.xpath(
                    pages.GCSBLearningJourneyPage.journey_description
                )[0].strip()
                if journey.xpath(pages.GCSBLearningJourneyPage.journey_description)
                else 'No description available',
                'link': link,
            }
        )

    return data


parser = argparse.ArgumentParser(description='Extract ML learning path')
parser.add_argument('--url', help='GCSB Journey URL')
args = parser.parse_args()

# Resolution order for the journey URL: CLI flag, env var, config, prompt.
GCSB_JOURNEY_URL = (
    args.url
    or os.getenv('GCSB_JOURNEY_URL')
    or CONFIG.GCSB_JOURNEY_URL
    or input('Please enter the GCSB Journey URL: ')
)

data = extract_ml_learning_path(GCSB_JOURNEY_URL)

if data:
    try:
        with open(
            DATA_FOLDER.joinpath(f'{COURSE_CODE}-Courses.csv'),
            'w',
            encoding='utf-8',
            newline='',
        ) as f:
            writer = DictWriter(
                f, fieldnames=['title', 'details', 'description', 'link']
            )
            # Bug fix: emit the header row. scrape_course_template.py reads
            # this file back with csv.DictReader and indexes rows by 'title'
            # and 'link', which only works when a header line is present.
            writer.writeheader()
            writer.writerows(data)
        print(f'Data successfully written to {COURSE_CODE}-Courses.csv')
    except IOError as e:
        print(f'An I/O error occurred while writing the file: {e}')
    except csv.Error as e:
        print(f'A CSV-related error occurred: {e}')
    except Exception as e:
        print(f'An unexpected error occurred while writing the file: {e}')
else:
    print('No data to write!')
def format_docs(docs):
    """Join retrieved documents' text into one prompt-context string.

    Each document's ``page_content`` is kept verbatim; documents are
    separated by a blank line so the LLM sees distinct passages.
    """
    contents = (doc.page_content for doc in docs)
    return '\n\n'.join(contents)
import csv
import io
from collections import defaultdict
from html import unescape
from pathlib import Path

import requests
from config import CONFIG
from lxml import etree
from scrapers.google_cloud_skill_boost.models import Activity, Course, CourseSubmodule
from utils import get_safestring

# Learning-path code whose course list was produced by scrape_journey.py;
# also names the output folder.
COURSE_CODE = "CLMML11"

DATA_FOLDER = Path(CONFIG.DATA_PATH, COURSE_CODE)
DATA_FOLDER.mkdir(exist_ok=True, parents=True)

# Maps course title -> raw (HTML-unescaped) JSON string describing its
# modules, as embedded in each page's <ql-course modules="..."> attribute.
course_modules_mapping = {}
# NOTE(review): csv.DictReader keys rows by the header line, so this file
# must start with a 'title,details,description,link' header — confirm that
# scrape_journey.py writes one.
with open(DATA_FOLDER.joinpath(f"{COURSE_CODE}-Courses.csv")) as f:
    course_meta = io.StringIO(f.read())

csvreader = csv.DictReader(course_meta)
for course in csvreader:
    # TODO: Support scraping GCB Labs
    if "labs" in course["title"].lower():
        continue
    r = requests.get(course["link"])
    print(str(r.content)[:100])
    html_parser = etree.HTMLParser()
    dom = etree.fromstring(r.content, html_parser)

    # Prerequisites are the text of the <div> following the one whose child
    # text reads 'Prerequisites'.
    prerequisites = None
    if prerequisites := dom.xpath(
        "(//div[div/text() = 'Prerequisites'])/following-sibling::div/text()"
    ):
        prerequisites = "".join(prerequisites[0]).replace("\n", " ")

    # The course structure is embedded as an HTML-escaped JSON attribute;
    # pages without it are skipped.
    if course_modules := dom.xpath("//ql-course/@modules"):
        course_modules = course_modules[0]
        course_modules = unescape(course_modules)
    else:
        continue

    course_modules_mapping[course["title"]] = course_modules

    # Append one meta row (title, link, prerequisites) per scraped course.
    with open(
        DATA_FOLDER.joinpath(f"{COURSE_CODE}-Modules-Meta.csv"),
        "a",
        encoding="utf-8",
    ) as f:
        csvwriter = csv.writer(f)
        print(prerequisites)
        csvwriter.writerow([course["title"], course["link"], prerequisites])

for course_title, course_module in course_modules_mapping.items():
    # Parse the raw JSON into pydantic models (Course has a list __root__).
    parsed_courses = [c for c in Course.parse_raw(course_module).__root__]
    submodule_activities: dict[CourseSubmodule, list[dict]] = defaultdict(list)

    # Link Submodules with their activities
    for submodule in parsed_courses:
        for step in submodule.steps:
            submodule_activities[submodule].extend(
                [activity.dict() for activity in step.activities]
            )

    course_title = get_safestring(course_title)

    # One CSV per submodule listing that submodule's activities.
    for submodule in submodule_activities:
        submodule_title = get_safestring(submodule.title)
        with open(
            DATA_FOLDER.joinpath(f"{submodule_title}.csv"),
            "w",
            encoding="utf-8",
        ) as f:
            fieldnames = Activity.__fields__.keys()
            csvwriter = csv.DictWriter(f, fieldnames)
            csvwriter.writeheader()
            csvwriter.writerows(submodule_activities[submodule])

    # One CSV per course summarising its submodules (nested steps and the
    # UI-only 'expanded' flag excluded).
    with open(DATA_FOLDER.joinpath(f"{course_title}.csv"), "w", encoding="utf-8") as f:
        excluded_fields = {"steps", "expanded"}
        fieldnames = sorted(set(CourseSubmodule.__fields__.keys()) - excluded_fields)
        csvwriter = csv.DictWriter(f, fieldnames)
        csvwriter.writeheader()
        csvwriter.writerows(
            [submodule.dict(exclude=excluded_fields) for submodule in parsed_courses]
        )
class KaggleTutorial(BaseModel): 21 | name: str 22 | url: str # E.g "/code/ryanholbrook/what-is-feature-engineering" 23 | authorUsername: str 24 | 25 | @validator("url", each_item=True) 26 | def convert_to_absolute_url(cls, url): 27 | return convert_relative_url_to_absolute(url, domain=KAGGLE_URL) 28 | 29 | description: str 30 | learnTutorial: KaggleTutorial 31 | 32 | 33 | class KagglePrerequsite(BaseModel): 34 | name: str 35 | trackSlug: str 36 | 37 | @validator("trackSlug", each_item=True) 38 | def convert_to_absolute_url(cls, trackSlug): 39 | return convert_relative_url_to_absolute(trackSlug, domain=KAGGLE_LEARN_URL) 40 | 41 | 42 | class KaggleAuthor(BaseModel): 43 | displayName: str 44 | userName: str 45 | 46 | 47 | class KaggleCourse(BaseModel): 48 | name: str 49 | description: str 50 | estimatedTimeHours: int 51 | trackSlug: str 52 | lessons: list[KaggleLesson] 53 | prerequisites: Optional[list[KagglePrerequsite]] 54 | authors: list[KaggleAuthor] 55 | 56 | @validator("trackSlug", each_item=True) 57 | def convert_to_absolute_url(cls, trackSlug): 58 | return convert_relative_url_to_absolute(trackSlug, domain=KAGGLE_LEARN_URL) 59 | 60 | @property 61 | def processed_authors(self): 62 | return ",".join( 63 | [f"{author.userName}|{author.displayName}" for author in self.authors] 64 | ) 65 | 66 | def write_course_summary_to_file(self, f): 67 | csvwriter = writer(f) 68 | csvwriter.writerow(["name", "description", "duration", "url", "authors"]) 69 | csvwriter.writerows( 70 | [ 71 | [ 72 | self.name, 73 | self.description, 74 | self.estimatedTimeHours, 75 | self.trackSlug, 76 | self.processed_authors, 77 | ], 78 | EMPTY_CSV_ROW, 79 | ] 80 | ) 81 | 82 | if self.prerequisites: 83 | # Write prerequisites 84 | csvwriter.writerow( 85 | ["prerequisites"], 86 | ) 87 | csvwriter.writerows( 88 | [[p.name, p.trackSlug] for p in self.prerequisites] + EMPTY_CSV_ROW 89 | ) 90 | 91 | # Write lessons 92 | csvwriter.writerow(["lessons"]) 93 | csvwriter.writerow(["name", "description", 
"url", "authorUserName"]) 94 | csvwriter.writerows( 95 | [ 96 | [ 97 | lesson.learnTutorial.name, 98 | lesson.description, 99 | lesson.learnTutorial.url, 100 | lesson.learnTutorial.authorUsername, 101 | ] 102 | for lesson in self.lessons 103 | ] 104 | ) 105 | csvwriter.writerow(EMPTY_CSV_ROW) 106 | -------------------------------------------------------------------------------- /getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | This guide will help you set up and run the Learning Path Index (LPI) project on your local machine. 4 | 5 | ## Prerequisites 6 | 7 | Before you begin, ensure you have the following installed: 8 | 9 | - **Python (tested with version 3.x):** Download and install Python from [the official website](https://www.python.org/downloads/). 10 | - **Git:** Download and install Git from [the official website](https://git-scm.com/downloads). 11 | - **Docker (optional):** While not strictly necessary, Docker simplifies running certain components. Install Docker following the [instructions for your operating system](https://docs.docker.com/get-docker/). 12 | 13 | ## Installation 14 | 15 | The LPI project consists of several independent applications (applets) that work together: 16 | 17 | - **Web scraper** 18 | - **LLM Variant 01 (Ollama)** 19 | - **LLM Variant 02 (OpenAI)** 20 | 21 | **Note:** This guide focuses on setting up the general repository. Each applet may have additional instructions. 22 | 23 | ### Step 1: Clone the Repository 24 | 25 | Clone the repository to your local machine using the following command: 26 | 27 | ```bash 28 | git clone https://github.com/neomatrix369/learning-path-index.git learning-path-index 29 | 30 | cd learning-path-index 31 | ``` 32 | 33 | ### Step 2: Install Dependencies 34 | 35 | Python dependencies for each applet are located in the `./requirements/` directory, with each applet having a separate file. 
Here's how to install them using a virtual environment: 36 | 37 | #### Create a virtual environment: 38 | 39 | ```bash 40 | python -m venv venv 41 | ``` 42 | 43 | #### Activate the virtual environment 44 | 45 | MacOS/Linux 46 | 47 | ```bash 48 | . venv/bin/activate 49 | ``` 50 | 51 | Windows 52 | 53 | ```powershell 54 | venv\Scripts\activate 55 | ``` 56 | 57 | #### Install base dependencies 58 | 59 | ```bash 60 | pip install -r requirements/base.txt 61 | ``` 62 | 63 | #### Install dependencies for specific applets 64 | 65 | Each applet may have additional dependencies. Look for a requirements.txt file within the directory for the specific applet (e.g., requirements/scraper.txt) and install them using: 66 | 67 | ```bash 68 | pip install -r requirements/<applet>.txt 69 | ``` 70 | 71 | Replace `<applet>` with the actual name of the applet (e.g., `scraper`). 72 | 73 | ### Step 3: Setup pre-commit hooks 74 | 75 | Pre-commit hooks automate tasks like code formatting and linting. Install them using: 76 | 77 | ```bash 78 | pre-commit install 79 | ``` 80 | 81 | ### Step 4: Setup the Applets 82 | 83 | Each applet has its own setup and usage instructions. Refer to the documentation specific to each applet for detailed guidance on: 84 | 85 | - Web scraper: [Installation instructions and usage guide](app/course-scraper/README.md). 86 | 87 | - LLM Variant 01 (Ollama): [Instructions on setting up and using Ollama](app/llm-poc-variant-01/deploy/aws/README.md). 88 | 89 | - LLM Variant 02 (OpenAI): [Instructions on creating an OpenAI account and API keys](app/llm-poc-variant-02/README.md). 90 | 91 | **Tip**: Look for additional documentation files within the directory for each applet. 92 | 93 | ## Troubleshooting 94 | 95 | Here are some common issues or errors that you might face: 96 | 97 | - Dependency Conflicts: Ensure that your dependencies are up to date and consistent with the versions specified in the requirements directory.
98 | - OpenAI Rate limit error: The free tier of OpenAI has limitations on API calls. Consider upgrading to a paid account if you frequently encounter this error. 99 | 100 | For further assistance, refer to the project's documentation (if available) or reach out to the project maintainers on the GitHub repository. 101 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==23.2.1 2 | aiohappyeyeballs==2.4.3 3 | aiohttp==3.11.6 4 | aiosignal==1.3.1 5 | anyio==4.6.2.post1 6 | asgiref==3.8.1 7 | asyncer==0.0.7 8 | attrs==24.2.0 9 | backoff==2.2.1 10 | bcrypt==4.2.1 11 | bidict==0.23.1 12 | build==1.2.2.post1 13 | cachetools==5.5.0 14 | certifi==2024.8.30 15 | chainlit==1.1.404 16 | charset-normalizer==3.4.0 17 | chevron==0.14.0 18 | chroma-hnswlib==0.7.6 19 | chromadb==0.5.20 20 | click==8.1.7 21 | coloredlogs==15.0.1 22 | dataclasses-json==0.5.14 23 | Deprecated==1.2.15 24 | durationpy==0.9 25 | fastapi==0.110.3 26 | filelock==3.16.1 27 | filetype==1.2.0 28 | flatbuffers==24.3.25 29 | frozenlist==1.5.0 30 | fsspec==2024.10.0 31 | google-auth==2.36.0 32 | googleapis-common-protos==1.66.0 33 | greenlet==3.1.1 34 | grpcio==1.68.0 35 | h11==0.14.0 36 | httpcore==1.0.7 37 | httptools==0.6.4 38 | httpx==0.27.2 39 | huggingface-hub==0.26.2 40 | humanfriendly==10.0 41 | idna==3.10 42 | importlib_metadata==8.5.0 43 | importlib_resources==6.4.5 44 | Jinja2==3.1.4 45 | joblib==1.4.2 46 | kubernetes==31.0.0 47 | langchain==0.0.261 48 | langsmith==0.0.92 49 | Lazify==0.4.0 50 | literalai==0.0.607 51 | markdown-it-py==3.0.0 52 | MarkupSafe==3.0.2 53 | marshmallow==3.23.1 54 | mdurl==0.1.2 55 | mmh3==5.0.1 56 | monotonic==1.6 57 | mpmath==1.3.0 58 | multidict==6.1.0 59 | mypy-extensions==1.0.0 60 | nest-asyncio==1.6.0 61 | networkx==3.4.2 62 | numexpr==2.10.1 63 | numpy==1.26.4 64 | nvidia-cublas-cu12==12.4.5.8 65 | 
nvidia-cuda-cupti-cu12==12.4.127 66 | nvidia-cuda-nvrtc-cu12==12.4.127 67 | nvidia-cuda-runtime-cu12==12.4.127 68 | nvidia-cudnn-cu12==9.1.0.70 69 | nvidia-cufft-cu12==11.2.1.3 70 | nvidia-curand-cu12==10.3.5.147 71 | nvidia-cusolver-cu12==11.6.1.9 72 | nvidia-cusparse-cu12==12.3.1.170 73 | nvidia-nccl-cu12==2.21.5 74 | nvidia-nvjitlink-cu12==12.4.127 75 | nvidia-nvtx-cu12==12.4.127 76 | oauthlib==3.2.2 77 | onnxruntime==1.20.0 78 | openapi-schema-pydantic==1.2.4 79 | opentelemetry-api==1.28.2 80 | opentelemetry-exporter-otlp==1.28.2 81 | opentelemetry-exporter-otlp-proto-common==1.28.2 82 | opentelemetry-exporter-otlp-proto-grpc==1.28.2 83 | opentelemetry-exporter-otlp-proto-http==1.28.2 84 | opentelemetry-instrumentation==0.49b2 85 | opentelemetry-instrumentation-asgi==0.49b2 86 | opentelemetry-instrumentation-fastapi==0.49b2 87 | opentelemetry-proto==1.28.2 88 | opentelemetry-sdk==1.28.2 89 | opentelemetry-semantic-conventions==0.49b2 90 | opentelemetry-util-http==0.49b2 91 | orjson==3.10.11 92 | overrides==7.7.0 93 | packaging==23.2 94 | pillow==11.0.0 95 | posthog==3.7.2 96 | propcache==0.2.0 97 | protobuf==5.28.3 98 | pyasn1==0.6.1 99 | pyasn1_modules==0.4.1 100 | pydantic==1.10.19 101 | Pygments==2.18.0 102 | PyJWT==2.10.0 103 | PyPika==0.48.9 104 | pyproject_hooks==1.2.0 105 | python-dateutil==2.9.0.post0 106 | python-dotenv==1.0.1 107 | python-engineio==4.10.1 108 | python-multipart==0.0.9 109 | python-socketio==5.11.4 110 | PyYAML==6.0.2 111 | regex==2024.11.6 112 | requests==2.32.3 113 | requests-oauthlib==2.0.0 114 | rich==13.9.4 115 | rsa==4.9 116 | safetensors==0.4.5 117 | scikit-learn==1.5.2 118 | scipy==1.14.1 119 | sentence-transformers==3.3.1 120 | setuptools==75.6.0 121 | shellingham==1.5.4 122 | simple-websocket==1.1.0 123 | six==1.16.0 124 | sniffio==1.3.1 125 | SQLAlchemy==2.0.36 126 | starlette==0.37.2 127 | sympy==1.13.1 128 | syncer==2.0.3 129 | tenacity==8.5.0 130 | threadpoolctl==3.5.0 131 | tokenizers==0.20.3 132 | tomli==2.1.0 133 | 
torch==2.5.1 134 | tqdm==4.67.0 135 | transformers==4.46.3 136 | triton==3.1.0 137 | typer==0.13.1 138 | typing-inspect==0.9.0 139 | typing_extensions==4.12.2 140 | uptrace==1.28.2 141 | urllib3==2.2.3 142 | uvicorn==0.25.0 143 | uvloop==0.21.0 144 | watchfiles==0.20.0 145 | websocket-client==1.8.0 146 | websockets==14.1 147 | wrapt==1.16.0 148 | wsproto==1.2.0 149 | yarl==1.17.2 150 | zipp==3.21.0 151 | -------------------------------------------------------------------------------- /app/llm-gemma-variant/src/vector_db.py: -------------------------------------------------------------------------------- 1 | from llama_index.core import SimpleDirectoryReader 2 | from llama_index.readers.file import CSVReader 3 | import weaviate 4 | from loguru import logger 5 | import pandas as pd 6 | from llama_index.core import VectorStoreIndex, StorageContext 7 | from llama_index.vector_stores.weaviate import WeaviateVectorStore 8 | 9 | 10 | class VectorDB: 11 | """ 12 | Create a weaviate vector database from a the Learning Path Index csv file. 13 | """ 14 | def __init__( 15 | self, 16 | data_path: str, 17 | index_name: str, 18 | ): 19 | """ 20 | Initialize the VectorDB class. 21 | 22 | Args: 23 | data_path: str, path to the Learning Path Index csv file. 24 | index_name: str, name of the index to create. 25 | Output: 26 | None 27 | """ 28 | self.data_path = data_path 29 | self.index_name = index_name 30 | 31 | try: 32 | self.client = weaviate.connect_to_embedded() 33 | except weaviate.exceptions.WeaviateConnectionError as e: 34 | raise ConnectionError(f"Failed to connect to Weaviate: {str(e)}") 35 | 36 | def disconnect(self): 37 | """ 38 | Disconnect from the Weaviate vector database. 
39 | """ 40 | if self.client: 41 | self.client.close() # Assuming the client has a close method 42 | logger.info("Disconnected from the Weaviate vector database.") 43 | else: 44 | logger.warning("No active connection to disconnect.") 45 | 46 | 47 | def LPI_loader(self): 48 | """ 49 | Load the Learning Path Index csv file. 50 | """ 51 | # Load data csv file 52 | df = pd.read_csv(self.data_path) 53 | 54 | # Use the CSVReader to load the data and load each row as a document by setting concat_rows=False 55 | parser = CSVReader(concat_rows=False) 56 | file_extractor = {".csv": parser} # Add other CSV formats as needed 57 | 58 | # Load the documents 59 | documents = SimpleDirectoryReader( 60 | input_files = [self.data_path], file_extractor=file_extractor 61 | ).load_data() 62 | 63 | logger.debug(documents[1]) 64 | 65 | 66 | # Adding Metadata to the documents 67 | for i, row in df.iterrows(): 68 | metadata = { 69 | 'source': row['Source'], 70 | 'course': row['Course_Learning_Material'], 71 | 'module': row['Module'] 72 | } 73 | documents[i + 1].metadata = metadata 74 | 75 | return documents 76 | 77 | def vector_db_creation(self): 78 | """ 79 | Create a weaviate vector database from the Learning Path Index csv file. 
80 | """ 81 | documents = self.LPI_loader() 82 | 83 | 84 | logger.info(f"Connected to the weaviate embedded instance: {self.client.is_ready()}") 85 | 86 | # Create the vector database 87 | vector_store = WeaviateVectorStore( 88 | weaviate_client = self.client, 89 | index_name = self.index_name 90 | ) 91 | 92 | # Set up the storage for the embeddings 93 | storage_context = StorageContext.from_defaults(vector_store=vector_store) 94 | # Setup the index 95 | # build VectorStoreIndex that takes care of chunking documents 96 | # and encoding chunks to embeddings for future retrieval 97 | 98 | logger.info(f"Creating the {self.index_name} index") 99 | index = VectorStoreIndex.from_documents( 100 | documents, storage_context=storage_context 101 | ) 102 | logger.info(f"The {self.index_name} index has been created") 103 | return index 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .chainlit/ 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | .idea/ 162 | 163 | # Security 164 | *.pem 165 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/README.md: -------------------------------------------------------------------------------- 1 | # Deployment: Infrastructure as code 2 | 3 | The scripts in this folder give the ability to provision and manage compute capacity using [AWS Infrastructure]([link to follow]), in order to deploy the docker container and run the app in it. 
4 | 5 | In short the scripts does the below: 6 | - [instructions to follow] 7 | 8 | **Table of content** 9 | - [Pre-requisites](#pre-requisites) 10 | - [Provisioning Infrastructure using Terraform](#provisioning-infrastructure-using-terraform) 11 | + [Create infrastructure from the CLI using Terraform](#create-infrastructure-from-the-cli-using-terraform) 12 | + [Deploy the docker image with the notebooks and libraries](#deploy-the-docker-image-with-the-notebooks-and-libraries) 13 | + [Destroy infrastructure (cleanup)](#destroy-infrastructure-cleanup) 14 | - [Security](#security) 15 | 16 | ## Pre-requisites 17 | 18 | - [AWS & Relates stuff] 19 | - [Install Terraform](https://learn.hashicorp.com/terraform/getting-started/install.html) (all methods for the various platforms are mentioned) 20 | - Clone this repo and in the right folder: 21 | ```bash 22 | $ git clone https://github.com/neomatrix369/learning-path-index/ 23 | $ cd learning-path-index 24 | $ cd app/llm-poc-variant-01/deploy/aws 25 | ``` 26 | 27 | For a summary (also helps to verify the steps) of the above steps please see [here](https://www.terraform.io/docs/providers/aws/index.html). 28 | 29 | ## Provisioning Infrastructure using Terraform 30 | 31 | ### Create infrastructure from the CLI using Terraform 32 | 33 | - Deploy with terraform 34 | 35 | ```bash 36 | $ terraform init 37 | $ terraform apply -var "ssh_private_key=$(cat )" --auto-approve 38 | ``` 39 | 40 | The deployment process should end with a list of private/public ip addresses like so: 41 | 42 | ```bash 43 | Apply complete! Resources: 9 added, 0 changed, 0 destroyed. 44 | 45 | Outputs: 46 | 47 | instance_private_ips = [ 48 | 10.1.nn.m 49 | ] 50 | instance_public_ips = [ 51 | 1xx.145.174.85 52 | ] 53 | 54 | ``` 55 | 56 | The public IP addresses are fairly dynamic in nature and could be between any range (example shown above). Please make a note of the Public IP above as it will be needed in the following steps. 
57 | 58 | ### Deploy the docker image with the notebooks and libraries 59 | 60 | - use ssh and docker to make that end meet 61 | 62 | ```bash 63 | $ ./run-docker-container.sh 64 | ``` 65 | 66 | ### Recover/retry from failed attempt 67 | 68 | - Apply the fix to the configuration or script or both 69 | - And run the below again: 70 | 71 | ```bash 72 | $ terraform apply -var "ssh_private_key=$(cat )" --auto-approve 73 | ``` 74 | 75 | ### Start clean after a failed attempt (errors encountered) 76 | 77 | - Run the below before proceeding: 78 | 79 | ```bash 80 | $ terraform destroy -var "ssh_private_key=$(cat )" --auto-approve 81 | $ terraform apply -var "ssh_private_key=$(cat )" --auto-approve 82 | ``` 83 | 84 | 85 | ### Destroy infrastructure (cleanup) 86 | 87 | - Remove resources or destroy them with terraform 88 | 89 | ```bash 90 | $ terraform destroy -var "ssh_private_key=$(cat )" --auto-approve 91 | ``` 92 | 93 | You should see something like this at the end of a successful run: 94 | 95 | ```text 96 | . 97 | . 98 | . 99 | Destroy complete! Resources: 7 destroyed. 100 | ``` 101 | 102 | ### Security 103 | 104 | Note that this setup does not take into account establishing a secure `http` i.e. `https` communication between the Jupyter lab instance and the browser. Please beware when using this in your target domain depending on the prerequisites you need to conform to. This example is good for learning and illustration purposes, please do NOT deploy it in production or public facing environments. 
105 | 106 | --- 107 | 108 | Go to [Main page](../../README.md) -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/README.md: -------------------------------------------------------------------------------- 1 | # Deployment: Infrastructure as Code 2 | 3 | The scripts in this folder give the ability to provision and manage compute capacity using [Google Cloud Platform](https://cloud.google.com/), in order to deploy the LLM Application and provision the Chainlit app. 4 | 5 | In short, the scripts do the following: 6 | - Create compute instances and associated network resources necessary to run an instance on GCP 7 | - Create the necessary firewall configurations to allow services to communicate publicly over HTTP and HTTPS 8 | - Set up an [Ollama](https://github.com/ollama/ollama) service on the Compute instance, and start it 9 | - Finally, start the Chainlit app, and expose it on port 8000. The interface will be accessible on `:8000` 10 | 11 | ![Preview of the Chainlit app](chainlit-app-demo.gif "Preview of the Chainlit app") 12 | 13 | **Table of Contents** 14 | - [Pre-requisites](#pre-requisites) 15 | - [Provisioning Infrastructure using Terraform](#provisioning-infrastructure-using-terraform) 16 | - [Create a new project on Google Console](#create-a-new-project-on-google-console) 17 | - [Authenticate Terraform with GCloud credentials](#authenticate-terraform-with-gcloud-credentials) 18 | - [Create Cloud Bucket to store Terraform state](#create-cloud-bucket-to-store-terraform-state) 19 | - [Deploy with Terraform](#deploy-with-terraform) 20 | - [Destroy infrastructure (cleanup)](#destroy-infrastructure-cleanup) 21 | - [Security](#security) 22 | 23 | ## Pre-requisites 24 | 25 | - A Google Console account with some credits. [If it's a new GCP account, you might get access to free $300 credits](https://cloud.google.com/free?hl=en) 26 | - Install GCloud CLI. 
See [the official GCloud installation guide](https://cloud.google.com/sdk/docs/install). 27 | - [Install Terraform](https://learn.hashicorp.com/terraform/getting-started/install.html) (all operating systems are supported) 28 | - Clone this repository 29 | 30 | ```bash 31 | git clone https://github.com/neomatrix369/learning-path-index/ 32 | cd learning-path-index 33 | cd app/llm-poc-variant-01/deploy/gcp 34 | ``` 35 | 36 | For a summary (also helps to verify the steps) of the above steps please see [here](https://registry.terraform.io/providers/hashicorp/google/latest/docs). 37 | 38 | ### Quick Terraform install script for Linux 39 | 40 | ```bash 41 | curl -sSL https://releases.hashicorp.com/terraform/1.9.8/terraform_1.9.8_linux_386.zip -o ~/terraform_1.9.8_linux_386.zip 42 | 43 | unzip -q ~/terraform_1.9.8_linux_386.zip -d /tmp/terraform_1.9.8_linux_386 44 | mv /tmp/terraform_1.9.8_linux_386/terraform /usr/local/bin/terraform 45 | 46 | rm -rf /tmp/terraform_1.9.8_linux_386/ 47 | ``` 48 | 49 | 50 | 51 | ## Provisioning Infrastructure using Terraform 52 | - #### Create a new project on Google Console. 53 | In your terminal, in the LPI repository folder, run: 54 | ```bash 55 | gcloud config set project 56 | ``` 57 | 58 | - #### Authenticate Terraform with GCloud credentials 59 | This workflow assumes you are working on a personal computer/workstation. For CI/CD pipelines, [other authentication steps are recommended](https://cloud.google.com/docs/terraform) 60 | ```bash 61 | gcloud auth application-default login 62 | ``` 63 | 64 | - #### Create Cloud Bucket to store Terraform state 65 | ```bash 66 | gsutil mb -l europe-west1 gs://llm-project-sbx-tf-state 67 | 68 | gsutil versioning set on gs://llm-project-sbx-tf-state 69 | ``` 70 | 71 | Substitute `europe-west1` for [any other region of your choice](https://cloud.google.com/compute/docs/regions-zones). 
72 | 73 | - #### Deploy with terraform 74 | 75 | ```bash 76 | terraform init 77 | 78 | terraform workspace new llm-project 79 | 80 | terraform apply --auto-approve 81 | ``` 82 | 83 | The deployment process should end with a list of private/public ip addresses like so: 84 | 85 | ```bash 86 | Apply complete! Resources: 1 added, 0 changed, 0 destroyed. 87 | 88 | Outputs: 89 | 90 | network_interface_0_access_config_0_nat_ip = "" 91 | network_interface_0_network_ip = "" 92 | self_link = "" 93 | tags = toset([ 94 | "http-server", 95 | "https-server", 96 | "lpi-sg", 97 | ]) 98 | ``` 99 | 100 | The public IP addresses are fairly dynamic in nature and could be between any range (example shown above). Please make a note of the Public IP above as it will be needed in the following steps. 101 | 102 | - #### SSH into the Compute Instance 103 | The compute instance can be accessed over SSH viz: 104 | ```bash 105 | gcloud compute ssh --project= --zone= lpi-cpu-vm 106 | ``` 107 | 108 | ### Destroy infrastructure (cleanup) 109 | 110 | - Remove resources or destroy them with terraform 111 | 112 | ```bash 113 | $ terraform destroy --var-file=project.tfvars --auto-approve 114 | ``` 115 | 116 | You should see something like this at the end of a successful run: 117 | 118 | ```text 119 | . 120 | . 121 | . 122 | Destroy complete! Resources: 7 destroyed. 123 | ``` 124 | 125 | ### Security 126 | 127 | Note that this setup does not take into account establishing a secure `http` i.e. `https` communication between the Chainlit instance and the browser, nor does it place emphasis on creating a fool-proof firewall for the compute instance. Please beware when using this in your target domain depending on the prerequisites you need to conform to. This example is good for learning and illustration purposes, please do NOT deploy it in production or public facing environments. 
128 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/lpiGPT.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import time 5 | from datetime import datetime 6 | 7 | import torch 8 | from constants import CHROMA_SETTINGS 9 | from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 10 | from langchain.chains import RetrievalQA 11 | from langchain.embeddings import HuggingFaceEmbeddings 12 | from langchain.llms import Ollama 13 | from langchain.prompts import PromptTemplate 14 | from langchain.vectorstores import Chroma 15 | from langchain.vectorstores.base import VectorStoreRetriever 16 | 17 | OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'http://localhost:11434') 18 | 19 | 20 | def build_retriever( 21 | model_embeddings: str, 22 | persist_directory: str, 23 | target_source_chunks: int = 500, 24 | ) -> VectorStoreRetriever: 25 | embeddings = HuggingFaceEmbeddings(model_name=model_embeddings) 26 | vector_db = Chroma( 27 | persist_directory, 28 | embedding_function=embeddings, 29 | client_settings=CHROMA_SETTINGS, 30 | ) 31 | return vector_db.as_retriever(search_kwargs={'k': target_source_chunks}) 32 | 33 | 34 | def build_prompt(): 35 | """ 36 | Reference/Guide: 37 | - https://smith.langchain.com/hub/rlm/rag-prompt-mistral 38 | - https://smith.langchain.com/hub/rlm/rag-prompt-llama 39 | """ 40 | prompt_template = """ 41 | [INST] 42 | <> You are an assistant for question-answering tasks using the Learning Path Index. 43 | Show the results in a table or tabular form, and the results must contain a link for each line of the courses, modules or sub-modules returned. 
44 | <> 45 | Context: {context} 46 | Question: {question} 47 | Answer: [/INST] 48 | """ 49 | return PromptTemplate( 50 | template=prompt_template, input_variables=['context', 'question'] 51 | ) 52 | 53 | 54 | def build_model( 55 | retriever: VectorStoreRetriever, 56 | model_name: str = 'gemma:2b', 57 | mute_stream: bool = False, 58 | ): 59 | IS_GPU_AVAILABLE = torch.cuda.is_available() 60 | ( 61 | print( 62 | f'~~~ GPU is available (CUDA-DNN Enabled: {torch.backends.cudnn.enabled}) ~~~' 63 | ) 64 | if IS_GPU_AVAILABLE 65 | else print('~~~ GPU is NOT available, falling back to CPU ~~~') 66 | ) 67 | start = time.time() 68 | 69 | # activate/deactivate the streaming StdOut callback for LLMs 70 | callbacks = [] if mute_stream else [StreamingStdOutCallbackHandler()] 71 | llm = Ollama(model=model_name, callbacks=callbacks, base_url=OLLAMA_HOST) 72 | qa = RetrievalQA.from_chain_type( 73 | llm=llm, 74 | chain_type='stuff', 75 | retriever=retriever, 76 | return_source_documents=True, 77 | chain_type_kwargs={'prompt': build_prompt()}, 78 | ) 79 | 80 | end = time.time() 81 | 82 | print(f'Models took about {end - start} seconds to load.') 83 | return qa, llm 84 | 85 | 86 | def parse_arguments(): 87 | parser = argparse.ArgumentParser( 88 | description='lpiGPT: Ask questions to your documents without an internet connection, ' 89 | 'using the power of LLMs (the InstructGPT or Chat model).' 
90 | ) 91 | # https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard or https://ollama.ai/library 92 | parser.add_argument( 93 | '--chat-model', 94 | '-CM', 95 | action='store', 96 | default='gemma:2b', 97 | help='Use this flag to set the InstructGPT or Chat model name, see https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard or https://ollama.ai/library for more names.', 98 | ) 99 | # For embeddings model, the example uses a sentence-transformers model 100 | # https://www.sbert.net/docs/pretrained_models.html 101 | # "The all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster 102 | # and still offers good quality." 103 | parser.add_argument( 104 | '--embeddings-model-name', 105 | '-EM', 106 | action='store', 107 | default='all-MiniLM-L6-v2', 108 | help='Use this flag to set the Embeddings model name, see https://www.sbert.net/docs/pretrained_models.html for examples of names. Use the same model as used for ingesting the documents (ingest.py)', 109 | ) 110 | 111 | parser.add_argument( 112 | '--persist-directory', 113 | '-P', 114 | action='store', 115 | default='vector_db', 116 | help='Use this flag to specify the name of the vector database, this will be a folder on the local machine.', 117 | ) 118 | 119 | parser.add_argument( 120 | '--target-source-chunks', 121 | '-C', 122 | action='store', 123 | default=500, 124 | help='Use this flag to specify the name chunk size to use to chunk source data.', 125 | ) 126 | 127 | parser.add_argument( 128 | '--hide-source', 129 | '-S', 130 | action='store_true', 131 | help='Use this flag to disable printing of source documents used for answers.', 132 | ) 133 | 134 | parser.add_argument( 135 | '--mute-stream', 136 | '-M', 137 | action='store_true', 138 | help='Use this flag to disable the streaming StdOut callback for LLMs.', 139 | ) 140 | 141 | return parser.parse_args() 142 | 143 | 144 | def main(): 145 | args = parse_arguments() 146 | retriever = 
build_retriever(args.embeddings_model_name, args.persist_directory) 147 | qa, _llm = build_model( 148 | retriever, 149 | model_name=args.chat_model, 150 | mute_stream=args.mute_stream, 151 | ) 152 | # Interactive questions and answers 153 | while True: 154 | query = input('\nEnter a query: ') 155 | if query == 'exit': 156 | break 157 | if query.strip() == '': 158 | continue 159 | 160 | # Get the answer from the chain 161 | start = time.time() 162 | print( 163 | f"\nStart time: {datetime.utcfromtimestamp(start).strftime('%Y-%m-%d %H:%M:%S')}" 164 | ) 165 | answer = qa({'query': query}) 166 | answer, docs = ( 167 | answer['result'], 168 | answer.get('source_documents', []), 169 | ) 170 | end = time.time() 171 | 172 | # Print the result 173 | print('\n\n> Question:') 174 | print(query) 175 | print( 176 | f"\nEnd time: {datetime.utcfromtimestamp(end).strftime('%Y-%m-%d %H:%M:%S')}" 177 | ) 178 | print(f'\nAnswer (took about {end - start} seconds):') 179 | print(answer) 180 | 181 | # Print the relevant sources used for the answer 182 | if not args.hide_source: 183 | for document in docs: 184 | print('\n> ' + document.metadata['source'] + ':') 185 | print(document.page_content) 186 | 187 | 188 | if __name__ == '__main__': 189 | main() 190 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning Path Index 2 | A repo with data files, assets and code supporting and powering the Learning Path Index. 
3 | 4 | Table of Contents 5 | 6 | - [Overview](#overview) 7 | - [Key Features](#key-features) 8 | - [Potential Innovations](#potential-innovations) 9 | - [How to contribute to this initiative?](#how-to-contribute-to-this-initiative) 10 | - [Important Links](#important-links) 11 | - [Credits](#credits) 12 | 13 | ## Overview 14 | 15 | The **Learning Path Index** is a dynamic and versatile repository designed to empower learners in the fields of Data Science and Machine Learning. It offers a curated collection of byte-sized courses and learning materials, meticulously organized and tagged to facilitate effortless discovery. Whether you're a novice or a seasoned practitioner, the Learning Path Index is your gateway to knowledge, tailored to your interests and needs. 16 | 17 | The outcome of this effort was the creation of this _Git repo_ and the KaggleX [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset) by the mentors and mentees of **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between August 2023 and November 2023), see the [Credits](#credits) section for more details. 18 | 19 | 20 | ## Key Features 21 | 22 | ### 1. Comprehensive Collection 23 | - A vast array of byte-sized courses and learning materials covering Data Science and Machine Learning topics. 24 | - Courses are categorized and tagged by keywords, categories, topics, and interests, all closely aligned with the world of Data Science and Machine Learning. 25 | 26 | ### 2. Robust Search and Filtering 27 | - Effortless search and filtering capabilities allow you to find the content you need quickly. 28 | - Search by full or partial text, including keywords, categories, topics, and interests. 29 | 30 | ### 3. Collaborative Contribution 31 | - Easy-to-use mechanisms for adding new courses and enhancing existing entries. 32 | - Contribute your expertise and help refine course definitions for the benefit of the entire community. 
33 | 34 | ### [4. Automated Data/Course Scraping *(WIP)*:](./app/course-scraper) 35 | - Automatically scrape course information and details from multiple platforms. 36 | - Data enrichment and augmentation using AI! 37 | 38 | ### [5. Keyword Extraction with KeyBERT and WordWise-Kaggle Notebook](https://github.com/neomatrix369/learning-path-index/blob/main/app/Keyword%20Extraction%20with%20KeyBERT%20and%20WordWise.ipynb) 39 | 40 | ### [6. Learning Pathway Index Data Cleaning and Preprocessing](https://www.kaggle.com/code/manishkr1754/lpi-data-cleaning-and-preprocessing/notebook) 41 | 42 | ### [7. Contextual Search On Kaggle Learning Path Index](https://github.com/mbhoge/learning-path-index-rag-search/blob/learning-path-index-search-01/app/llm-poc-variant-02/README.md) 43 | 44 | ## Getting Started and Setup 45 | Please refer to the getting started guide, [Getting Started](getting-started.md), for setup instructions. 46 | 47 | ## Potential Innovations 48 | 49 | Explore exciting possibilities for enhancing the Learning Path Index: 50 | 51 | 1. **Course Chunking**: Divide pending courses into byte-sized modules for a more digestible learning experience. 52 | 53 | 2. **Content Enrichment**: Assist in fine-tuning, correcting, and enriching existing byte-sized entries to ensure high-quality learning materials. 54 | 55 | 3. **Kaggle Dataset**: Transform the Learning Path Index into a dataset and host it on Kaggle Datasets for broader accessibility. 56 | 57 | 4. **Keyword Extraction**: Automatically extract keywords from course websites and byte-sized modules to enhance search functionality. 58 | 59 | 5. **Exploratory Data Analysis (EDA)**: Conduct exploratory data analysis on course materials to gain valuable insights into the content of the datasets. 60 | 61 | 6. **NLP Profiler**: Implement NLP Profiler and Pandas Profiler to analyze courses by various parameters, uncovering hidden patterns. 62 | 63 | 7. 
**Interactive Learning**: Develop a Streamlit, Shiny, or Mercury app to make these courses available online, fostering an interactive learning environment. 64 | 65 | 8. **Cloud Hosting**: Deploy the app on popular cloud platforms like Heroku, Netlify, AWS, or others for widespread accessibility. 66 | 67 | 9. **Notebook Integration**: Create Google Colab, Kaggle Notebook, Amazon Notebook, or Interactive Jupyter Notebook integrations to facilitate seamless course exploration. 68 | 69 | 10. **NLP Enhancement**: Apply advanced NLP techniques to the existing data to extract deeper linguistic value and meaning. 70 | 71 | 11. **Generative AI**: Utilize the dataset to build Large Language Models (LLMs) and Generative AI models, opening doors to innovative AI-related activities. 72 | 73 | 12. **Continuous Improvement**: Brainstorm and implement additional ideas to enhance the tool's utility for both the community and individuals. 74 | 75 | Join us in this exciting journey of learning, collaboration, and innovation. Together, we can create a valuable resource for the Data Science and Machine Learning community. Let's embark on the path to knowledge and discovery! 76 | 77 | 78 | ## How to contribute to this initiative? 79 | 80 | - You can also join us by taking part in the next [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607)) 81 | - Keep your eyes open on the **Kaggle Discussions** page and other **KaggleX** social media channels. 
Or find us on the [Kaggle Discord](https://www.kaggle.com/discussions/general/429933) channel to learn more about the next steps 82 | - Create notebooks from this data 83 | - Create supplementary or complementary data for or from this dataset 84 | - Submit corrections/enhancements or anything else to help improve this dataset so it has a wider use and purpose 85 | 86 | ## Important Links 87 | 88 | - [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607)) 89 | - KaggleX [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset) 90 | - KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) 91 | - [New Official Kaggle Discord Server!](https://www.kaggle.com/discussions/general/429933) 92 | 93 | ## Credits 94 | 95 | Credits for all the work done to create this Git Repo and the KaggleX [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset/data) goes to these mentors and mentees (in no particular order): [Manish Kumar](https://www.kaggle.com/manishkr1754), [Ben Aji](https://www.kaggle.com/benajii) (_mentor_), [Emmanuel Katchy](https://www.kaggle.com/tobetek), [Ezeogu Ernest](https://www.kaggle.com/tobetek), [Manish](https://www.kaggle.com/manish5), [Mustafa](https://www.kaggle.com/mustafa254), [Nnamdi Idowu-Anifowoshe](https://www.kaggle.com/idowuchukwudi), [Sheba Alkali](https://www.kaggle.com/shebaalkali), [Zainab ikeoluwa](https://www.kaggle.com/zainabikeoluwa), [Wendy Mak](https://www.kaggle.com/wwymak) (_mentor_), [Misirya Hameed](https://www.linkedin.com/in/misiriya-shahul-hameed-b3957875) (_mentor_), [Chukwuebuka Obi](https://www.kaggle.com/chukwuebukaobi), [Victor Umunna](https://www.kaggle.com/victorumunna), [Pui Yueng](https://www.kaggle.com/lorentzyeung), [Afolake Solomon](https://www.kaggle.com/flakkyddon), [Faith 
Osoro](https://www.kaggle.com/faithosoro), [Chukwudi Idowu](https://www.kaggle.com/chukwudiidowu) and many others who were part of the **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between August 2023 and November 2023). 96 | 97 | Our gratitude also goes to our silent supporters of this initiative from organisers to the mentors and mentees whose help and support kept us going. 98 | 99 | _**Note:** In case your name or mention is missed out in the above list, then please let us know._ 100 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/ingest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import glob 4 | import os 5 | import time 6 | from multiprocessing import Pool 7 | from typing import List 8 | 9 | from constants import CHROMA_SETTINGS 10 | from langchain.docstore.document import Document 11 | from langchain.document_loaders import CSVLoader 12 | from langchain.embeddings import HuggingFaceEmbeddings 13 | from langchain.text_splitter import RecursiveCharacterTextSplitter 14 | from langchain.vectorstores import Chroma 15 | from tqdm import tqdm 16 | 17 | # Map file extensions to document loaders and their arguments 18 | LOADER_MAPPING = { 19 | '.csv': (CSVLoader, {}), 20 | # Add more mappings for other file extensions and loaders as needed 21 | } 22 | 23 | 24 | def load_single_document(file_path: str) -> List[Document]: 25 | ext = '.' 
+ file_path.rsplit('.', 1)[-1] 26 | if ext in LOADER_MAPPING: 27 | loader_class, loader_args = LOADER_MAPPING[ext] 28 | loader = loader_class(file_path, **loader_args) 29 | return loader.load() 30 | 31 | raise ValueError(f"Unsupported file extension '{ext}'") 32 | 33 | 34 | def load_documents(source_dir: str, ignored_files: List[str] = None) -> List[Document]: 35 | """ 36 | Loads all documents from the source documents directory, ignoring specified files 37 | """ 38 | if not ignored_files: 39 | ignored_files = [] 40 | all_files = [] 41 | for ext in LOADER_MAPPING: 42 | all_files.extend( 43 | glob.glob(os.path.join(source_dir, f'**/*{ext}'), recursive=True) 44 | ) 45 | filtered_files = [ 46 | file_path for file_path in all_files if file_path not in ignored_files 47 | ] 48 | 49 | with Pool(processes=os.cpu_count()) as pool: 50 | results = [] 51 | with tqdm( 52 | total=len(filtered_files), desc='Loading new documents', ncols=80 53 | ) as pbar: 54 | for _, docs in enumerate( 55 | pool.imap_unordered(load_single_document, filtered_files) 56 | ): 57 | results.extend(docs) 58 | pbar.update() 59 | 60 | return results 61 | 62 | 63 | def process_documents( 64 | source_documents: str, 65 | chunk_size: int, 66 | chunk_overlap: int, 67 | ignored_files: List[str] = None, 68 | ) -> List[Document]: 69 | """ 70 | Load documents and split in chunks 71 | """ 72 | if not ignored_files: 73 | ignored_files = [] 74 | start_time = time.time() 75 | print(f'Loading documents from {source_documents}') 76 | 77 | documents = load_documents(source_documents, ignored_files) 78 | if not documents: 79 | print('No new documents to load') 80 | exit(0) 81 | print(f'Loaded {len(documents)} new documents from {source_documents}') 82 | 83 | text_splitter = RecursiveCharacterTextSplitter( 84 | chunk_size=chunk_size, chunk_overlap=chunk_overlap 85 | ) 86 | texts = text_splitter.split_documents(documents) 87 | print(f'Split into {len(texts)} chunks of text (max. 
{chunk_size} tokens each)') 88 | end_time = time.time() 89 | print(f'Loading documents took about {end_time - start_time} seconds to complete.') 90 | return texts 91 | 92 | 93 | def does_vectorstore_exist(persist_directory: str) -> bool: 94 | """ 95 | Checks if vectorstore exists 96 | """ 97 | if os.path.exists(os.path.join(persist_directory, 'index')) and ( 98 | os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) 99 | and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')) 100 | ): 101 | list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin')) 102 | list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl')) 103 | # At least 3 documents are needed in a working vectorstore 104 | if len(list_index_files) > 3: 105 | return True 106 | return False 107 | 108 | 109 | def parse_arguments(): 110 | parser = argparse.ArgumentParser( 111 | description='ingest: process one or more documents (text) in order to create embeddings (using the Embeddings models)' 112 | 'from them, and make them ready to be used with LLMs when a question is asked to the InstructGPT or Chat Model.' 113 | ) 114 | # For embeddings model, the example uses a sentence-transformers model 115 | # https://www.sbert.net/docs/pretrained_models.html 116 | # "The all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster 117 | # and still offers good quality." 118 | parser.add_argument( 119 | '--embeddings-model-name', 120 | '-EM', 121 | action='store', 122 | default='all-MiniLM-L6-v2', 123 | help='Use this flag to set the Embeddings model name, see https://www.sbert.net/docs/pretrained_models.html for examples of names. 
Use the same model when running the lpiGPT.py app.', 124 | ) 125 | 126 | parser.add_argument( 127 | '--source-documents', 128 | '-S', 129 | action='store', 130 | default='source_documents', 131 | help='Use this flag to specify the name of the folder where all the (source/input) documents are stored for ingestion purposes, on the local machine. The documents contained in them are of the type `.csv`.', 132 | ) 133 | 134 | parser.add_argument( 135 | '--persist-directory', 136 | '-P', 137 | action='store', 138 | default='vector_db', 139 | help='Use this flag to specify the name of the vector database, this will be a folder on the local machine.', 140 | ) 141 | 142 | parser.add_argument( 143 | '--target-source-chunks', 144 | '-C', 145 | action='store', 146 | default=500, 147 | help='Use this flag to specify the name chunk size to use to chunk source data.', 148 | ) 149 | 150 | parser.add_argument( 151 | '--chunk-overlap', 152 | '-O', 153 | action='store', 154 | default=50, 155 | help='Use this flag to specify the name chunk overlap value to use to chunk source data.', 156 | ) 157 | 158 | return parser.parse_args() 159 | 160 | 161 | def main(): 162 | args = parse_arguments() 163 | 164 | start_time = time.time() 165 | # Create embeddings 166 | print('\nCreating/downloading HF embeddings started...') 167 | embeddings = HuggingFaceEmbeddings(model_name=args.embeddings_model_name) 168 | end_time = time.time() 169 | print( 170 | f'Creating/downloading HF embeddings completed! It took about {end_time - start_time} seconds to complete.' 
171 | ) 172 | 173 | start_time = time.time() 174 | print('\nStarted with ingestion process, to create vector database...') 175 | if does_vectorstore_exist(args.persist_directory): 176 | # Update and store locally vectorstore 177 | print(f'-- Appending to existing vectorstore at {args.persist_directory}') 178 | vector_db = Chroma( 179 | persist_directory=args.persist_directory, 180 | embedding_function=embeddings, 181 | client_settings=CHROMA_SETTINGS, 182 | ) 183 | collection = vector_db.get() 184 | texts = process_documents( 185 | args.source_documents, 186 | args.target_source_chunks, 187 | args.chunk_overlap, 188 | [metadata['source'] for metadata in collection['metadatas']], 189 | ) 190 | print('-- Creating embeddings. May take some minutes...') 191 | vector_db.add_documents(texts) 192 | else: 193 | # Create and store locally vectorstore 194 | print('-- Creating new vectorstore') 195 | texts = process_documents( 196 | args.source_documents, args.target_source_chunks, args.chunk_overlap 197 | ) 198 | print('-- Creating embeddings. May take some minutes...') 199 | vector_db = Chroma.from_documents( 200 | texts, 201 | embeddings, 202 | persist_directory=args.persist_directory, 203 | client_settings=CHROMA_SETTINGS, 204 | ) 205 | vector_db.persist() 206 | vector_db = None 207 | end_time = time.time() 208 | 209 | print( 210 | f'Ingestion complete! It took about {end_time - start_time} seconds to complete.' 
211 | ) 212 | print('\nYou can now run lpiGPT.py to query your documents') 213 | 214 | 215 | if __name__ == '__main__': 216 | main() 217 | -------------------------------------------------------------------------------- /app/llm-poc-variant-02/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | from datetime import datetime 4 | import time 5 | from langchain.llms import OpenAI 6 | from langchain.document_loaders import TextLoader 7 | from langchain.text_splitter import CharacterTextSplitter 8 | from langchain.embeddings.openai import OpenAIEmbeddings 9 | from langchain.chains import RetrievalQA 10 | from langchain.prompts import PromptTemplate 11 | from langchain.llms import OpenAI 12 | from langchain.vectorstores import FAISS 13 | from langchain.prompts import PromptTemplate 14 | 15 | from interface import app 16 | import streamlit as st 17 | # Define GenerateLearningPathIndexEmbeddings class: 18 | # - Load .csv file 19 | # - Chunk text 20 | # - Chunk size = 1000 characters 21 | # - Chunk overlap = 30 characters 22 | # - Create FAISS vector store from chunked text and OpenAI embeddings 23 | # - Get FAISS vector store 24 | # This class is used to generate the FAISS vector store from the .csv file. 25 | class GenerateLearningPathIndexEmbeddings: 26 | def __init__(self, csv_filename): 27 | load_dotenv() # Load .env file 28 | self.openai_api_key = os.getenv("OPENAI_API_KEY") 29 | # load the csv file from the data folder above 2 folders 30 | self.data_path = os.path.join('..\..\data', csv_filename) 31 | self.our_custom_data = None 32 | self.openai_embeddings = None 33 | self.faiss_vectorstore = None 34 | 35 | self.load_csv_data() 36 | self.get_openai_embeddings() 37 | self.create_faiss_vectorstore_with_csv_data_and_openai_embeddings() 38 | 39 | def load_csv_data(self): 40 | # Load your dataset (e.g., CSV, JSON, etc.) 
41 | print(' -- Started loading .csv file for chunking purposes.') 42 | loader = TextLoader(self.data_path) 43 | document = loader.load() 44 | text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=30, separator="\n") 45 | self.our_custom_data = text_splitter.split_documents(document) 46 | print(f' -- Finished spitting (i.e. chunking) text (i.e. documents) from the .csv file (i.e. {self.data_path}).') 47 | 48 | def get_openai_embeddings(self): 49 | self.openai_embeddings = OpenAIEmbeddings(openai_api_key=self.openai_api_key, request_timeout=60) 50 | 51 | def create_faiss_vectorstore_with_csv_data_and_openai_embeddings(self): 52 | faiss_vectorstore_foldername = "faiss_learning_path_index" 53 | if not os.path.exists(faiss_vectorstore_foldername): 54 | print(' -- Creating a new FAISS vector store from chunked text and OpenAI embeddings.') 55 | vectorstore = FAISS.from_documents(self.our_custom_data, self.openai_embeddings) 56 | vectorstore.save_local(faiss_vectorstore_foldername) 57 | print(f' -- Saved the newly created FAISS vector store at "{faiss_vectorstore_foldername}".') 58 | else: 59 | print(f' -- WARNING: Found existing FAISS vector store at "{faiss_vectorstore_foldername}", loading from cache.') 60 | print(f' -- NOTE: Delete the FAISS vector store at "{faiss_vectorstore_foldername}", if you wish to regenerate it from scratch for the next run.') 61 | self.faiss_vectorstore = FAISS.load_local( 62 | "faiss_learning_path_index", self.openai_embeddings 63 | ) 64 | 65 | def get_faiss_vector_store(self): 66 | return self.faiss_vectorstore 67 | 68 | 69 | # https://discuss.streamlit.io/t/how-to-check-if-code-is-run-inside-streamlit-and-not-e-g-ipython/23439/7 70 | def running_inside_streamlit(): 71 | """ 72 | Function to check whether python code is run within streamlit 73 | 74 | Returns 75 | ------- 76 | use_streamlit : boolean 77 | True if code is run within streamlit, else False 78 | """ 79 | try: 80 | from streamlit.runtime.scriptrunner import 
get_script_run_ctx 81 | if not get_script_run_ctx(): 82 | use_streamlit = False 83 | else: 84 | use_streamlit = True 85 | except ModuleNotFoundError: 86 | use_streamlit = False 87 | return use_streamlit 88 | 89 | 90 | # Define GenAI class: 91 | # - Create prompt template 92 | # - Create GenAI project 93 | # - Get response for query 94 | # This class is used to get the response for a query from the GenAI project. 95 | # The GenAI project is created from the FAISS vector store. 96 | class GenAILearningPathIndex: 97 | def __init__(self, faiss_vectorstore): 98 | load_dotenv() # Load .env file 99 | self.openai_api_key = os.getenv("OPENAI_API_KEY") 100 | self.faiss_vectorstore = faiss_vectorstore 101 | 102 | prompt_template = \ 103 | """ 104 | Use the following template to answer the question at the end, 105 | from the Learning Path Index csv file, 106 | display top 10 results in a tablular format and it 107 | should look like this: 108 | | Learning Pathway | duration | link | Module 109 | | --- | --- | --- | --- | 110 | | ... | ... | ... | ... | 111 | it must contain a link for each line of the result in a table, 112 | consider the duration and Module information mentioned in the question, 113 | If you don't know the answer, don't make an entry in the table, 114 | {context} 115 | Question: {question} 116 | """ 117 | PROMPT = PromptTemplate(template=prompt_template, input_variables=["context","question"]) 118 | # The chain_type_kwargs are passed to the chain_type when it is created. 119 | self.chain_type_kwargs = {"prompt": PROMPT} 120 | # Create the GenAI project 121 | self.llm = OpenAI(temperature=1.0, openai_api_key=self.openai_api_key) 122 | # Get response for query 123 | # The response is returned as a string. 
124 | 125 | def get_response_for(self, query: str): 126 | qa = RetrievalQA.from_chain_type( 127 | llm=self.llm, chain_type="stuff", 128 | retriever=self.faiss_vectorstore.as_retriever(), 129 | chain_type_kwargs=self.chain_type_kwargs 130 | ) 131 | return qa.run(query) 132 | 133 | def get_formatted_time(current_time = time.time()): 134 | return datetime.utcfromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S') 135 | 136 | # Load the model 137 | @st.cache_data 138 | def load_model(): 139 | start_time = time.time() 140 | print(f"\nStarted loading custom embeddings (created from .csv file) at {get_formatted_time(start_time)}") 141 | learningPathIndexEmbeddings = GenerateLearningPathIndexEmbeddings("Learning_Pathway_Index.csv") 142 | faiss_vectorstore = learningPathIndexEmbeddings.get_faiss_vector_store() 143 | end_time = time.time() 144 | print(f"Finished loading custom embeddings (created from .csv file) at {get_formatted_time(end_time)}") 145 | print(f"Custom embeddings (created from .csv file) took about {end_time - start_time} seconds to load.") 146 | return faiss_vectorstore 147 | 148 | # Query the model 149 | def query_gpt_model(query: str): 150 | start_time = time.time() 151 | print(f"\nQuery processing start time: {get_formatted_time(start_time)}") 152 | genAIproject = GenAILearningPathIndex(faiss_vectorstore) 153 | answer = genAIproject.get_response_for(query) 154 | end_time = time.time() 155 | print(f"\nQuery processing finish time: {get_formatted_time(end_time)}") 156 | print(f"\nAnswer (took about {end_time - start_time} seconds)") 157 | return answer 158 | 159 | 160 | if __name__=='__main__': 161 | faiss_vectorstore = load_model() 162 | 163 | if running_inside_streamlit(): 164 | print("\nStreamlit environment detected. 
\nTo run a CLI interactive version just run `python main.py` in the CLI.\n") 165 | query_from_stream_list = app() 166 | if query_from_stream_list: 167 | answer = query_gpt_model(query_from_stream_list) 168 | st.write(answer) 169 | else: 170 | print("\nCommand-line interactive environment detected.\n") 171 | while True: 172 | query = input("\nEnter a query: ") 173 | if query == "exit": 174 | break 175 | if query.strip() == "": 176 | continue 177 | 178 | if query: 179 | answer = query_gpt_model(query) 180 | 181 | print("\n\n> Question:") 182 | print(query) 183 | print(answer) 184 | -------------------------------------------------------------------------------- /app/llm-poc-variant-02/learning_path_index_contextual_search.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import os\n","from dotenv import load_dotenv\n","from datetime import datetime\n","import time\n","from langchain.llms import OpenAI\n","from langchain.document_loaders import TextLoader\n","from langchain.text_splitter import CharacterTextSplitter\n","from langchain.embeddings.openai import OpenAIEmbeddings\n","from langchain.chains import RetrievalQA\n","from langchain.prompts import PromptTemplate\n","from langchain.llms import OpenAI\n","from langchain.vectorstores import FAISS\n","from langchain.prompts import PromptTemplate"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from interface import app\n","import streamlit as st\n","# Define GenerateLearningPathIndexEmbeddings class: \n","# - Load .csv file\n","# - Chunk text\n","# - Chunk size = 1000 characters\n","# - Chunk overlap = 30 characters\n","# - Create FAISS vector store from chunked text and OpenAI embeddings\n","# - Get FAISS vector store\n","# This class is used to generate the FAISS vector store from the .csv file.\n","class GenerateLearningPathIndexEmbeddings:\n"," 
def __init__(self, csv_filename):\n"," load_dotenv() # Load .env file\n"," self.openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n"," self.data_path = os.path.join('..\\..\\data', csv_filename)\n"," self.our_custom_data = None\n"," self.openai_embeddings = None\n"," self.faiss_vectorstore = None\n"," self.load_csv_data()\n"," self.get_openai_embeddings()\n"," self.create_faiss_vectorstore_with_csv_data_and_openai_embeddings()\n"," \n"," def load_csv_data(self):\n"," # Load your dataset (e.g., CSV, JSON, etc.)\n"," print(' -- Started loading .csv file for chunking purposes.')\n"," loader = TextLoader(self.data_path)\n"," document = loader.load()\n"," text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=30, separator=\"\\n\")\n"," self.our_custom_data = text_splitter.split_documents(document)\n"," print(f' -- Finished spitting (i.e. chunking) text (i.e. documents) from the .csv file (i.e. {self.data_path}).')\n"," \n"," def get_openai_embeddings(self):\n"," self.openai_embeddings = OpenAIEmbeddings(openai_api_key=self.openai_api_key, request_timeout=60)\n"," \n"," def create_faiss_vectorstore_with_csv_data_and_openai_embeddings(self):\n"," faiss_vectorstore_foldername = \"faiss_learning_path_index\"\n"," if not os.path.exists(faiss_vectorstore_foldername):\n"," print(' -- Creating a new FAISS vector store from chunked text and OpenAI embeddings.')\n"," vectorstore = FAISS.from_documents(self.our_custom_data, self.openai_embeddings)\n"," vectorstore.save_local(faiss_vectorstore_foldername)\n"," print(f' -- Saved the newly created FAISS vector store at \"{faiss_vectorstore_foldername}\".')\n"," else:\n"," print(f' -- WARNING: Found existing FAISS vector store at \"{faiss_vectorstore_foldername}\", loading from cache.')\n"," print(f' -- NOTE: Delete the FAISS vector store at \"{faiss_vectorstore_foldername}\", if you wish to regenerate it from scratch for the next run.')\n"," self.faiss_vectorstore = FAISS.load_local(\n"," \"faiss_learning_path_index\", 
self.openai_embeddings\n"," )\n"," def get_faiss_vector_store(self):\n"," return self.faiss_vectorstore"]},{"cell_type":"markdown","metadata":{},"source":["https://discuss.streamlit.io/t/how-to-check-if-code-is-run-inside-streamlit-and-not-e-g-ipython/23439/7"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["def running_inside_streamlit():\n"," \"\"\"\n"," Function to check whether python code is run within streamlit\n"," Returns\n"," -------\n"," use_streamlit : boolean\n"," True if code is run within streamlit, else False\n"," \"\"\"\n"," try:\n"," from streamlit.runtime.scriptrunner import get_script_run_ctx\n"," if not get_script_run_ctx():\n"," use_streamlit = False\n"," else:\n"," use_streamlit = True\n"," except ModuleNotFoundError:\n"," use_streamlit = False\n"," return use_streamlit"]},{"cell_type":"markdown","metadata":{},"source":["Define GenAI class:
\n"," - Create prompt template
\n"," - Create GenAI project
\n"," - Get response for query
\n","This class is used to get the response for a query from the GenAI project.
\n","The GenAI project is created from the FAISS vector store."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["class GenAILearningPathIndex:\n"," def __init__(self, faiss_vectorstore):\n"," load_dotenv() # Load .env file\n"," self.openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n"," self.faiss_vectorstore = faiss_vectorstore\n"," prompt_template = \\\n"," \"\"\"\n"," Use the following template to answer the question at the end, \n"," from the Learning Path Index csv file,\n"," display top 10 results in a tablular format and it \n"," should look like this:\n"," | Learning Pathway | duration | link | Module\n"," | --- | --- | --- | --- |\n"," | ... | ... | ... | ... |\n"," it must contain a link for each line of the result in a table,\n"," consider the duration and Module information mentioned in the question,\n"," If you don't know the answer, don't make an entry in the table,\n"," {context}\n"," Question: {question}\n"," \"\"\"\n"," PROMPT = PromptTemplate(template=prompt_template, input_variables=[\"context\",\"question\"])\n"," # The chain_type_kwargs are passed to the chain_type when it is created.\n"," self.chain_type_kwargs = {\"prompt\": PROMPT}\n"," # Create the GenAI project \n"," self.llm = OpenAI(temperature=1.0, openai_api_key=self.openai_api_key)\n"," # Get response for query\n"," # The response is returned as a string. 
\n"," \n"," def get_response_for(self, query: str):\n"," qa = RetrievalQA.from_chain_type(\n"," llm=self.llm, chain_type=\"stuff\", \n"," retriever=self.faiss_vectorstore.as_retriever(),\n"," chain_type_kwargs=self.chain_type_kwargs\n"," )\n"," return qa.run(query)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["def get_formatted_time(current_time = time.time()):\n"," return datetime.utcfromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')"]},{"cell_type":"markdown","metadata":{},"source":[" Load the model"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["@st.cache_data\n","def load_model():\n"," start_time = time.time()\n"," print(f\"\\nStarted loading custom embeddings (created from .csv file) at {get_formatted_time(start_time)}\")\n"," learningPathIndexEmbeddings = GenerateLearningPathIndexEmbeddings(\"Learning_Pathway_Index.csv\")\n"," faiss_vectorstore = learningPathIndexEmbeddings.get_faiss_vector_store()\n"," end_time = time.time()\n"," print(f\"Finished loading custom embeddings (created from .csv file) at {get_formatted_time(end_time)}\")\n"," print(f\"Custom embeddings (created from .csv file) took about {end_time - start_time} seconds to load.\")\n"," return faiss_vectorstore"]},{"cell_type":"markdown","metadata":{},"source":[" Query the model"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["def query_gpt_model(query: str):\n"," start_time = time.time()\n"," print(f\"\\nQuery processing start time: {get_formatted_time(start_time)}\")\n"," genAIproject = GenAILearningPathIndex(faiss_vectorstore)\n"," answer = genAIproject.get_response_for(query)\n"," end_time = time.time()\n"," print(f\"\\nQuery processing finish time: {get_formatted_time(end_time)}\")\n"," print(f\"\\nAnswer (took about {end_time - start_time} seconds)\")\n"," return answer"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["if 
__name__=='__main__':\n"," faiss_vectorstore = load_model()\n"," if running_inside_streamlit():\n"," print(\"\\nStreamlit environment detected. \\nTo run a CLI interactive version just run `python main.py` in the CLI.\\n\")\n"," query_from_stream_list = app()\n"," if query_from_stream_list:\n"," answer = query_gpt_model(query_from_stream_list)\n"," st.write(answer)\n"," else:\n"," print(\"\\nCommand-line interactive environment detected.\\n\")\n"," while True:\n"," query = input(\"\\nEnter a query: \")\n"," if query == \"exit\":\n"," break\n"," if query.strip() == \"\":\n"," continue\n"," if query:\n"," answer = query_gpt_model(query)\n"," print(\"\\n\\n> Question:\")\n"," print(query)\n"," print(answer)"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.4"}},"nbformat":4,"nbformat_minor":2} 2 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/README.md: -------------------------------------------------------------------------------- 1 | # lpiGPT - Learning Path Index GPT 2 | 3 | Ever thought you could ask/query a GPT about a course or smaller module of a course and have it find such bits of learning material across multiple sources of courses. 4 | 5 | A standalone GPT app based on [Ollama](https://github.com/jmorganca/ollama) and the [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset). 6 | 7 | It's simple and runs on the local machine with smaller sized and free LLMs. 
8 | 9 | > Note: credit for this program goes to the original authors of [langchain-python-rag-privategpt](https://github.com/jmorganca/ollama/tree/main/examples/langchain-python-rag-privategpt) from Ivan Martinez who contributed to an example on [jmorganca/ollama](https://github.com/jmorganca/ollama). 10 | 11 | 12 | ## Table of Contents 13 | 14 | - [Requirements](#requirements) 15 | - [Installation](#installation) 16 | - [Setup](#setup) 17 | - [Downloading Learning Path Index datasets](#downloading-learning-path-index-datasets) 18 | - [Ingesting files](#ingesting-files) 19 | - [via native shell CLI](#via-native-shell-cli) 20 | - [Usage](#usage) 21 | - [Ask questions](#ask-questions) 22 | - [via native shell CLI](#via-native-shell-cli-1) 23 | - [via Docker container](#via-docker-container) 24 | - [Try a different model](#try-a-different-model) 25 | - [Adding more files](#adding-more-files) 26 | - [Models](#models) 27 | - [Embeddings models](#embeddings-models) 28 | - [Chat models](#chat-models) 29 | - [Known issues](#known-issues) 30 | - [Contributing](#contributing) 31 | - [License](#license) 32 | 33 | ## Requirements 34 | 35 | List out the key requirements needed to run the project, such as: 36 | 37 | - System requirements: 38 | - Quadcore Intel CPU 2.3Ghz or higher, 16-32GB RAM, 100 GB Free diskspace 39 | - Preferably Linux or macOS 40 | - Python 3.9 41 | - [pyenv](https://github.com/pyenv/pyenv) 42 | - or venv 43 | - or [pipenv](https://pipenv.pypa.io/en/latest/) 44 | - Docker (optional) 45 | - Ollama ([Download & Install](https://ollama.com/download)) 46 | - Windows: 47 | - Microsoft Visual C++ 14.0 or greater is required (needed when installing ```hnswlib``` ) 48 | 49 | ## Installation 50 | 51 | Install [Ollama](https://github.com/jmorganca/ollama) using the below command on the host/local machine: 52 | 53 | ```bash 54 | curl https://ollama.ai/install.sh | sh 55 | ``` 56 | 57 | Pull the model you'd like to use: 58 | 59 | ```shell 60 | ollama pull 
llama2-uncensored 61 | ``` 62 | 63 | Set up a virtual environment (or use the [Docker route](#via-docker-container)): 64 | 65 | ```shell 66 | python3 -m venv .venv 67 | source .venv/bin/activate 68 | ``` 69 | 70 | Please note there are other options to use as well i.e. Conda, venv, virtualenv, poetry, etc. to isolate your development environments. 71 | 72 | For Windows, download Microsoft Visual C++ 14.0 or greater ([Link](https://visualstudio.microsoft.com/visual-cpp-build-tools/)). During installation, ensure that "Desktop development with C++" is selected. 73 | 74 | Install the Python dependencies: 75 | 76 | ```shell 77 | pip install -r requirements.txt 78 | ``` 79 | 80 | If you haven't installed Ollama yet, refer to the [Ollama repository](https://github.com/ollama/ollama) for installation instructions. 81 | 82 | Pull the model you'd like to use: 83 | 84 | ```shell 85 | ollama pull llama2-uncensored 86 | ``` 87 | 88 | and start the Ollama server 89 | 90 | ```shell 91 | ollama serve 92 | ``` 93 | 94 | 95 | ## Setup 96 | 97 | ### Downloading Learning Path Index datasets 98 | 99 | ```bash 100 | mkdir -p source_documents 101 | 102 | curl https://raw.githubusercontent.com/neomatrix369/learning-path-index/main/data/Courses_and_Learning_Material.csv -o "source_documents/Courses_and_Learning_Material.csv" 103 | 104 | curl https://raw.githubusercontent.com/neomatrix369/learning-path-index/main/data/Learning_Pathway_Index.csv -o "source_documents/Learning_Pathway_Index.csv" 105 | ``` 106 | 107 | Or you can manually download them from the [Kaggle Dataset: Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset). 
108 | 109 | ### Ingesting files 110 | 111 | #### via native shell CLI 112 | 113 | ```shell 114 | python3 ingest.py 115 | ``` 116 | 117 | Output should look like this: 118 | 119 | ```shell 120 | root@sai-XPS-15-9560:/home# python3 ingest.py 121 | Downloading (…)e9125/.gitattributes: 100%|███████████████████████████████████████████████████████████████████| 1.18k/1.18k [00:00<00:00, 2.07MB/s] 122 | Downloading (…)_Pooling/config.json: 100%|████████████████████████████████████████████████████████████████████████| 190/190 [00:00<00:00, 378kB/s] 123 | Downloading (…)7e55de9125/README.md: 100%|███████████████████████████████████████████████████████████████████| 10.6k/10.6k [00:00<00:00, 16.2MB/s] 124 | Downloading (…)55de9125/config.json: 100%|███████████████████████████████████████████████████████████████████████| 612/612 [00:00<00:00, 1.53MB/s] 125 | Downloading (…)ce_transformers.json: 100%|████████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 252kB/s] 126 | Downloading (…)125/data_config.json: 100%|███████████████████████████████████████████████████████████████████| 39.3k/39.3k [00:00<00:00, 29.4MB/s] 127 | Downloading pytorch_model.bin: 100%|█████████████████████████████████████████████████████████████████████████| 90.9M/90.9M [00:09<00:00, 9.11MB/s] 128 | Downloading (…)nce_bert_config.json: 100%|█████████████████████████████████████████████████████████████████████| 53.0/53.0 [00:00<00:00, 97.4kB/s] 129 | Downloading (…)cial_tokens_map.json: 100%|████████████████████████████████████████████████████████████████████████| 112/112 [00:00<00:00, 698kB/s] 130 | Downloading (…)e9125/tokenizer.json: 100%|█████████████████████████████████████████████████████████████████████| 466k/466k [00:00<00:00, 5.22MB/s] 131 | Downloading (…)okenizer_config.json: 100%|████████████████████████████████████████████████████████████████████████| 350/350 [00:00<00:00, 627kB/s] 132 | Downloading (…)9125/train_script.py: 
100%|███████████████████████████████████████████████████████████████████| 13.2k/13.2k [00:00<00:00, 21.1MB/s] 133 | Downloading (…)7e55de9125/vocab.txt: 100%|█████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 10.7MB/s] 134 | Downloading (…)5de9125/modules.json: 100%|████████████████████████████████████████████████████████████████████████| 349/349 [00:00<00:00, 721kB/s] 135 | Creating new vectorstore 136 | Loading documents from source_documents 137 | Loading new documents: 100%|██████████████████████| 2/2 [00:00<00:00, 40.44it/s] 138 | Loaded 1414 new documents from source_documents 139 | Split into 2214 chunks of text (max. 500 tokens each) 140 | Creating embeddings. May take some minutes... 141 | Ingestion complete! You can now run lpiGPT.py to query your documents 142 | ``` 143 | 144 | ```bash 145 | usage: ingest.py [-h] [--embeddings-model-name EMBEDDINGS_MODEL_NAME] [--source-documents SOURCE_DOCUMENTS] [--persist-directory PERSIST_DIRECTORY] 146 | [--target-source-chunks TARGET_SOURCE_CHUNKS] [--chunk-overlap CHUNK_OVERLAP] 147 | 148 | ingest: ingest: process one or more documents (text) in order to create embeddings (using the Embeddings models) from them, and make them ready to be used with LLMs when a question is asked to the InstructGPT or Chat Model. 149 | 150 | optional arguments: 151 | -h, --help show this help message and exit 152 | --embeddings-model-name EMBEDDINGS_MODEL_NAME, -EM EMBEDDINGS_MODEL_NAME 153 | Use this flag to set the Embeddings model name, see https://www.sbert.net/docs/pretrained_models.html for examples of names. Use the same model 154 | when running the lpiGPT.py app. 155 | --source-documents SOURCE_DOCUMENTS, -S SOURCE_DOCUMENTS 156 | Use this flag to specify the name of the folder where all the (source/input) documents are stored for ingestion purposes, on the local machine. The 157 | documents contained in them are of the type `.csv`. 
158 | --persist-directory PERSIST_DIRECTORY, -P PERSIST_DIRECTORY 159 | Use this flag to specify the name of the vector database, this will be a folder on the local machine. 160 | --target-source-chunks TARGET_SOURCE_CHUNKS, -C TARGET_SOURCE_CHUNKS 161 | Use this flag to specify the name chunk size to use to chunk source data. 162 | --chunk-overlap CHUNK_OVERLAP, -O CHUNK_OVERLAP 163 | Use this flag to specify the name chunk overlap value to use to chunk source data. 164 | ``` 165 | 166 | #### Known issues 167 | 168 | - When trying to ingest and also run the GPT app, we can get this error on systems with Python 3.10 or older 169 | 170 | ```python 171 | RuntimeError: Your system has an unsupported version of sqlite3. Chroma requires sqlite3 >= 3.35.0. 172 | ``` 173 | 174 | If this occurs then use the Docker container to run your commands, instructions are given below under each sub-section. 175 | 176 | [back to ToC](#table-of-contents) 177 | 178 | ## Usage 179 | 180 | ### Ask questions 181 | 182 | #### via native shell CLI 183 | 184 | Before running ```lpiGPT.py``` you need to specify the base URL for the Ollama API or the local instance of Ollama running on your machine. By default this will return a ```None``` value. 
185 | 186 | - Windows: 187 | - This is typically http://localhost:11434 and can be set by using the following in command line: 188 | ```shell 189 | set OLLAMA_HOST=http://localhost:11434 190 | ``` 191 | 192 | ```shell 193 | python3 lpiGPT.py 194 | 195 | Enter a query: Fetch me all machine learning courses of the advanced level from the Learning Path Index and show me results in a tabular form 196 | 197 | Start time: 2023-10-07 16:14:18 198 | > Question: 199 | Fetch me all machine learning courses of the advanced level from the Learning Path Index and show me results in a tabular form 200 | End time: 2023-10-07 16:17:19 201 | Answer (took about 181.3118166923523 seconds): 202 | | Course Name | Level | Type | Duration | Module / Sub-module | Keywords/Tags/Skills/Interests/Categories | Links | 203 | |-------------------------------|--------|-------|----------|--------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------| 204 | 1. Machine Learning Engineer Learning Path | Intermediate to Advanced | Free during mentorship period | AI Foundations: Quiz | Machine Learning/ Cloud/Data/Infrastructure/Bigquery/| https://www.cloudskillsboost.google/course_sessions/4968855/quizzes/387518 205 | 2. Machine Learning Engineer Learning Path | Intermediate to Advanced | Free during mentorship period | AI Development Workflow: Quiz | AI/Development/API/Vertex AI/MLOps/Workflow| https://www.cloudskillsboost.google/course_sessions/4968855/quizzes/387541 206 | 3. Machine Learning Engineer Learning Path | Intermediate to Advanced | Free during mentorship period | AI Development Options: Quiz | AI/Development/API/Vertex AI/AutoML/Workflow| https://www.cloudskillsboost.google/course_sessions/4968855/quizzes/387529 207 | 4. 
Machine Learning Engineer Learning Path | Intermediate to Advanced | Free during mentorship period | BigQuery Machine Learning: Develop ML Models Where Your Data Lives: Introduction | Big Query/Explanable AI/ML models/Hyperparameter. tuning/recommendation system| https://www.cloudskillsboost.google/course_sessions/4968855/quizzes/387530 208 | Note: The results will be displayed in a table format with columns for Course Name, Level, Type, Duration, Module / Sub-module, Keywords/Tags/Skills/Interests/Categories and Links. 209 | 210 | . 211 | . 212 | . 213 | [A list of source documents it got the results from] 214 | . 215 | . 216 | . 217 | ``` 218 | 219 | To exit the GPT prompt, press Ctrl-C or Ctrl-D and it will return to the Linux/Command-prompt. 220 | 221 | 222 | ```bash 223 | > python3 lpiGPT.py --help 224 | usage: lpiGPT.py [-h] [--chat-model CHAT_MODEL] [--embeddings-model-name EMBEDDINGS_MODEL_NAME] [--persist-directory PERSIST_DIRECTORY] 225 | [--target-source-chunks TARGET_SOURCE_CHUNKS] [--hide-source] [--mute-stream] 226 | 227 | lpiGPT: Ask questions to your documents without an internet connection, the power of LLMs (the InstructGPT or Chat model). 228 | 229 | optional arguments: 230 | -h, --help show this help message and exit 231 | --chat-model CHAT_MODEL, -CM CHAT_MODEL 232 | Use this flag to set the InstructGPT or Chat model name, see https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard or 233 | https://ollama.ai/library for more names. 234 | --embeddings-model-name EMBEDDINGS_MODEL_NAME, -EM EMBEDDINGS_MODEL_NAME 235 | Use this flag to set the Embeddings model name, see https://www.sbert.net/docs/pretrained_models.html for examples of names. Use the same model as 236 | used for ingesting the documents (ingest.py) 237 | --persist-directory PERSIST_DIRECTORY, -P PERSIST_DIRECTORY 238 | Use this flag to specify the name of the vector database, this will be a folder on the local machine. 
239 | --target-source-chunks TARGET_SOURCE_CHUNKS, -C TARGET_SOURCE_CHUNKS 240 | Use this flag to specify the name chunk size to use to chunk source data. 241 | --hide-source, -S Use this flag to disable printing of source documents used for answers. 242 | --mute-stream, -M Use this flag to disable the streaming StdOut callback for LLMs. 243 | ``` 244 | 245 | #### via Docker container 246 | 247 | You can also setup an isolated environment i.e. inside Docker container and perform the same above operations 248 | 249 | ```shell 250 | cd docker 251 | ./build-docker-image.sh 252 | ``` 253 | 254 | when finished with building the container run the below 255 | 256 | ```shell 257 | ./run-docker-container.sh 258 | ``` 259 | 260 | you will get a prompt like this: 261 | 262 | ```shell 263 | root@[your machine name]:/home#: 264 | ``` 265 | 266 | in there, type the same commands as in the **via native shell CLI** sections of [Ingesting files](#ingesting-files) and [Ask questions](#ask-questions) respectively. 267 | 268 | 269 | ### Try a different model 270 | 271 | ```shell 272 | ollama pull llama2:13b 273 | python3 lpiGPT.py --chat-model=llama2:13b 274 | ``` 275 | 276 | ### Adding more files 277 | 278 | Put any and all your files into the `source_documents` directory 279 | 280 | The supported extensions are: 281 | 282 | - `.csv`: CSV 283 | and others, we have trimmed them off from here to keep this example simple and concise. 284 | 285 | [back to ToC](#table-of-contents) 286 | 287 | ## Models 288 | 289 | ### Embeddings models 290 | 291 | For embeddings model, the example uses a sentence-transformers model https://www.sbert.net/docs/pretrained_models.html 292 | The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2` is 5 times faster and still offers good quality. 
293 | 294 | ### Chat models 295 | 296 | For chat models, have a look at [this list](https://github.com/jmorganca/ollama/#model-library) on [Ollama's github repo](https://github.com/jmorganca/ollama/). The list is basic, hence other LLM resources must be consulted i.e. 297 | 298 | - [Kaggle models](https://www.kaggle.com/models?query=LLM) 299 | - [HuggingFace models](https://huggingface.co/models?other=LLM) 300 | - ...(others).. 301 | 302 | _Please share your resources on either or both of the Embeddings and Chat models with us_ 303 | 304 | ## Contributing 305 | 306 | We are open to any or all of the below from your side in terms of contributions: 307 | 308 | - Reporting issues 309 | - Submitting pull requests 310 | - Coding standards or guidelines 311 | - Testing requirements 312 | 313 | ## License 314 | 315 | See [LICENSE](https://github.com/neomatrix369/learning-path-index/blob/main/LICENSE) in the root folder of the project 316 | 317 | [back to ToC](#table-of-contents) 318 | -------------------------------------------------------------------------------- /data/dataset-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "errorMessageNullable": null, 3 | "info": { 4 | "datasetSlugNullable": "learning-path-index-dataset", 5 | "ownerUserNullable": "neomatrix369", 6 | "usabilityRatingNullable": 1.0, 7 | "titleNullable": "Learning Path Index Dataset", 8 | "subtitleNullable": "A comprehensive dataset of Data Science, ML and AI learning paths and courses", 9 | "descriptionNullable": "# Description\nThe **Learning Path Index Dataset** is a comprehensive collection of byte-sized courses and learning materials tailored for individuals eager to delve into the fields of Data Science, Machine Learning, and Artificial Intelligence (AI), making it an indispensable reference for students, professionals, and educators in the Data Science and AI communities.\n\nThis **Kaggle Dataset** along with the KaggleX [Learning Path Index 
GitHub Repo](https://github.com/neomatrix369/learning-path-index) were created by the mentors and mentees of **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between _August 2023_ and _November 2023_, also [see this](https://www.kaggle.com/discussions/general/409607)). See **Credits** section at the bottom of the long description.\n\n# Inspiration\nThis dataset was created out of a commitment to facilitate learning and growth within the Data Science, Machine Learning, and AI communities. It started off as an idea at the end of **Cohort 2** of the [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) brainstorming and feedback session. It was one of the ideas to create byte-sized learning material to help our KaggleX mentees learn things faster. It aspires to simplify the process of finding, evaluating, and selecting the most fitting educational resources.\n\n# Context\nThis dataset was meticulously curated to assist learners in navigating the vast landscape of Data Science, Machine Learning, and AI education. It serves as a compass for those aiming to develop their skills and expertise in these rapidly evolving fields. \n\nThe mentors and mentees communicated via **Discord**, **Trello**, **Google Hangout**, etc... to put together these artifacts and made them public for everyone to _use and contribute back_.\n\n# Sources\nThe dataset compiles data from a curated selection of reputable sources including leading educational platforms such as **Google Developer, Google Cloud Skill Boost, IBM, Fast AI**, etc. By drawing from these trusted sources, we ensure that the data is both accurate and pertinent. The raw data and other artifacts as a result of this exercise can be found on the GitHub Repo i.e. 
KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index).\n\n# Content\nThe dataset encompasses the following attributes:\n\n- **Course / Learning Material:** The title of the Data Science, Machine Learning, or AI course or learning material.\n- **Source:** The provider or institution offering the course.\n- **Course Level:** The proficiency level, ranging from Beginner to Advanced.\n- **Type (Free or Paid):** Indicates whether the course is available for free or requires payment.\n- **Module:** Specific module or section within the course.\n- **Duration:** The estimated time required to complete the module or course.\n- **Module / Sub-module Difficulty Level:** The complexity level of the module or sub-module.\n- **Keywords / Tags / Skills / Interests / Categories:** Relevant keywords, tags, or categories associated with the course with a focus on Data Science, Machine Learning, and AI.\n- **Links:** Hyperlinks to access the course or learning material directly.\n\n# How to contribute to this initiative?\n\n- You can also join us by taking part in the next [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607))\n- Keep your eyes open on the **Kaggle Discussions** page and other **KaggleX** social media channels. Or find us on the [Kaggle Discord](https://www.kaggle.com/discussions/general/429933) channel to learn more about the next steps\n- Create notebooks from this data\n- Create supplementary or complementary data for or from this dataset\n- Submit corrections/enhancements or anything else to help improve this dataset so it has a wider use and purpose\n\n# License\nThe **Learning Path Index Dataset** is openly shared under a permissive license, allowing users to utilize the data for educational, analytical, and research purposes within the Data Science, Machine Learning, and AI domains. 
Feel free to _fork the dataset_ and make it your own, we would be delighted if you contributed back to the dataset and/or our KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) as well.\n\n# Important Links\n\n- [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607))\n- KaggleX [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset)\n- KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index)\n- [New Official Kaggle Discord Server!](https://www.kaggle.com/discussions/general/429933)\n\n# Credits\nCredits for all the work done to create this Kaggle Dataset and the KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) goes to these mentors and mentees (in no particular order): [Manish Kumar](https://www.kaggle.com/manishkr1754), [Ben Aji](https://www.kaggle.com/benajii) (_mentor_), [Emmanuel Katchy](https://www.kaggle.com/tobetek), [Ezeogu Ernest](https://www.kaggle.com/tobetek), [Manish](https://www.kaggle.com/manish5), [Mustafa](https://www.kaggle.com/mustafa254), [Nnamdi Idowu-Anifowoshe](https://www.kaggle.com/idowuchukwudi), [Sheba Alkali](https://www.kaggle.com/shebaalkali), [Zainab ikeoluwa](https://www.kaggle.com/zainabikeoluwa), [Wendy Mak](https://www.kaggle.com/wwymak) (_mentor_), [Misirya Hameed](https://www.linkedin.com/in/misiriya-shahul-hameed-b3957875) (_mentor_), [Chukwuebuka Obi](https://www.kaggle.com/chukwuebukaobi), [Victor Umunna](https://www.kaggle.com/victorumunna), [Pui Yueng](https://www.kaggle.com/lorentzyeung), [Afolake Solomon](https://www.kaggle.com/flakkyddon), [Faith Osoro](https://www.kaggle.com/faithosoro), [Chukwudi Idowu](https://www.kaggle.com/chukwudiidowu), and many others who were part of the **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between 
_August 2023_ and _November 2023_, also [see this](https://www.kaggle.com/discussions/general/409607)).\n\nOur gratitude also goes to our silent supporters of this initiative from organisers to the mentors and mentees whose help and support kept us going.\n\n_**Note:** In case your name or mention is missed out in the above list, then please let us know._", 10 | "datasetId": 3766406, 11 | "datasetSlug": "learning-path-index-dataset", 12 | "hasDatasetSlug": true, 13 | "ownerUser": "neomatrix369", 14 | "hasOwnerUser": true, 15 | "usabilityRating": 1.0, 16 | "hasUsabilityRating": true, 17 | "totalViews": 1779, 18 | "totalVotes": 32, 19 | "totalDownloads": 226, 20 | "title": "Learning Path Index Dataset", 21 | "hasTitle": true, 22 | "subtitle": "A comprehensive dataset of Data Science, ML and AI learning paths and courses", 23 | "hasSubtitle": true, 24 | "description": "# Description\nThe **Learning Path Index Dataset** is a comprehensive collection of byte-sized courses and learning materials tailored for individuals eager to delve into the fields of Data Science, Machine Learning, and Artificial Intelligence (AI), making it an indispensable reference for students, professionals, and educators in the Data Science and AI communities.\n\nThis **Kaggle Dataset** along with the KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) were created by the mentors and mentees of **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between _August 2023_ and _November 2023_, also [see this](https://www.kaggle.com/discussions/general/409607)). See **Credits** section at the bottom of the long description.\n\n# Inspiration\nThis dataset was created out of a commitment to facilitate learning and growth within the Data Science, Machine Learning, and AI communities. 
It started off as an idea at the end of **Cohort 2** of the [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) brainstorming and feedback session. It was one of the ideas to create byte-sized learning material to help our KaggleX mentees learn things faster. It aspires to simplify the process of finding, evaluating, and selecting the most fitting educational resources.\n\n# Context\nThis dataset was meticulously curated to assist learners in navigating the vast landscape of Data Science, Machine Learning, and AI education. It serves as a compass for those aiming to develop their skills and expertise in these rapidly evolving fields. \n\nThe mentors and mentees communicated via **Discord**, **Trello**, **Google Hangout**, etc... to put together these artifacts and made them public for everyone to _use and contribute back_.\n\n# Sources\nThe dataset compiles data from a curated selection of reputable sources including leading educational platforms such as **Google Developer, Google Cloud Skill Boost, IBM, Fast AI**, etc. By drawing from these trusted sources, we ensure that the data is both accurate and pertinent. The raw data and other artifacts as a result of this exercise can be found on the GitHub Repo i.e. 
KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index).\n\n# Content\nThe dataset encompasses the following attributes:\n\n- **Course / Learning Material:** The title of the Data Science, Machine Learning, or AI course or learning material.\n- **Source:** The provider or institution offering the course.\n- **Course Level:** The proficiency level, ranging from Beginner to Advanced.\n- **Type (Free or Paid):** Indicates whether the course is available for free or requires payment.\n- **Module:** Specific module or section within the course.\n- **Duration:** The estimated time required to complete the module or course.\n- **Module / Sub-module Difficulty Level:** The complexity level of the module or sub-module.\n- **Keywords / Tags / Skills / Interests / Categories:** Relevant keywords, tags, or categories associated with the course with a focus on Data Science, Machine Learning, and AI.\n- **Links:** Hyperlinks to access the course or learning material directly.\n\n# How to contribute to this initiative?\n\n- You can also join us by taking part in the next [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607))\n- Keep your eyes open on the **Kaggle Discussions** page and other **KaggleX** social media channels. Or find us on the [Kaggle Discord](https://www.kaggle.com/discussions/general/429933) channel to learn more about the next steps\n- Create notebooks from this data\n- Create supplementary or complementary data for or from this dataset\n- Submit corrections/enhancements or anything else to help improve this dataset so it has a wider use and purpose\n\n# License\nThe **Learning Path Index Dataset** is openly shared under a permissive license, allowing users to utilize the data for educational, analytical, and research purposes within the Data Science, Machine Learning, and AI domains. 
Feel free to _fork the dataset_ and make it your own, we would be delighted if you contributed back to the dataset and/or our KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) as well.\n\n# Important Links\n\n- [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607))\n- KaggleX [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset)\n- KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index)\n- [New Official Kaggle Discord Server!](https://www.kaggle.com/discussions/general/429933)\n\n# Credits\nCredits for all the work done to create this Kaggle Dataset and the KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) goes to these mentors and mentees (in no particular order): [Manish Kumar](https://www.kaggle.com/manishkr1754), [Ben Aji](https://www.kaggle.com/benajii) (_mentor_), [Emmanuel Katchy](https://www.kaggle.com/tobetek), [Ezeogu Ernest](https://www.kaggle.com/tobetek), [Manish](https://www.kaggle.com/manish5), [Mustafa](https://www.kaggle.com/mustafa254), [Nnamdi Idowu-Anifowoshe](https://www.kaggle.com/idowuchukwudi), [Sheba Alkali](https://www.kaggle.com/shebaalkali), [Zainab ikeoluwa](https://www.kaggle.com/zainabikeoluwa), [Wendy Mak](https://www.kaggle.com/wwymak) (_mentor_), [Misirya Hameed](https://www.linkedin.com/in/misiriya-shahul-hameed-b3957875) (_mentor_), [Chukwuebuka Obi](https://www.kaggle.com/chukwuebukaobi), [Victor Umunna](https://www.kaggle.com/victorumunna), [Pui Yueng](https://www.kaggle.com/lorentzyeung), [Afolake Solomon](https://www.kaggle.com/flakkyddon), [Faith Osoro](https://www.kaggle.com/faithosoro), [Chukwudi Idowu](https://www.kaggle.com/chukwudiidowu), and many others who were part of the **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between 
_August 2023_ and _November 2023_, also [see this](https://www.kaggle.com/discussions/general/409607)).\n\nOur gratitude also goes to our silent supporters of this initiative from organisers to the mentors and mentees whose help and support kept us going.\n\n_**Note:** In case your name or mention is missed out in the above list, then please let us know._", 25 | "hasDescription": true, 26 | "isPrivate": false, 27 | "keywords": [ 28 | "education", 29 | "artificial intelligence", 30 | "computer science", 31 | "programming", 32 | "beginner" 33 | ], 34 | "licenses": [ 35 | { 36 | "nameNullable": "Apache 2.0", 37 | "name": "Apache 2.0", 38 | "hasName": true 39 | } 40 | ], 41 | "collaborators": [ 42 | { 43 | "username": "manish5", 44 | "role": "reader" 45 | }, 46 | { 47 | "username": "mustafa254", 48 | "role": "reader" 49 | }, 50 | { 51 | "username": "benajii", 52 | "role": "reader" 53 | }, 54 | { 55 | "username": "zainabikeoluwa", 56 | "role": "reader" 57 | }, 58 | { 59 | "username": "shebaalkali", 60 | "role": "reader" 61 | }, 62 | { 63 | "username": "ernestdatascience", 64 | "role": "reader" 65 | }, 66 | { 67 | "username": "idowuchukwudi", 68 | "role": "reader" 69 | }, 70 | { 71 | "username": "tobetek", 72 | "role": "reader" 73 | }, 74 | { 75 | "username": "manishkr1754", 76 | "role": "writer" 77 | } 78 | ], 79 | "data": [ 80 | { 81 | "path": "Learning_Pathway_Index.csv", 82 | "description": "This file contains information about Data Science, Machine Learning, and AI courses and learning materials.", 83 | "schema": { 84 | "fields": [ 85 | { 86 | "name": "Module_Code", 87 | "description": "The course code of the course or learning material.", 88 | "type": "string" 89 | }, 90 | { 91 | "name": "Course_Learning_Material", 92 | "description": "The title of the course or learning material.", 93 | "type": "string" 94 | }, 95 | { 96 | "name": "Source", 97 | "description": "The provider or institution offering the course.", 98 | "type": "string" 99 | }, 100 | { 101 | "name": 
"Course_Level", 102 | "description": "The proficiency level, ranging from Beginner to Advanced.", 103 | "type": "string" 104 | }, 105 | { 106 | "name": "Type_Free_Paid", 107 | "description": "Indicates whether the course is available for free or requires payment.", 108 | "type": "string" 109 | }, 110 | { 111 | "name": "Module", 112 | "description": "Specific module or section within the course.", 113 | "type": "string" 114 | }, 115 | { 116 | "name": "Duration", 117 | "description": "The estimated time required to complete the module or course.", 118 | "type": "float" 119 | }, 120 | { 121 | "name": "Difficulty_Level", 122 | "description": "The complexity level of the module or sub-module.", 123 | "type": "string" 124 | }, 125 | { 126 | "name": "Keywords_Tags_Skills_Interests_Categories", 127 | "description": "Relevant keywords, tags, or categories associated with the course with a focus on Data Science, Machine Learning, and AI.", 128 | "type": "string" 129 | }, 130 | { 131 | "name": "Links", 132 | "description": "Hyperlinks to access the course or learning material directly.", 133 | "type": "string" 134 | } 135 | ] 136 | } 137 | }, 138 | { 139 | "path": "Courses_and_Learning_Material.csv", 140 | "description": "This file contains information about Data Science, Machine Learning, and AI courses and learning materials.", 141 | "schema": { 142 | "fields": [ 143 | { 144 | "name": "Module_Code", 145 | "description": "The course code of the course or learning material.", 146 | "type": "string" 147 | }, 148 | { 149 | "name": "Source", 150 | "description": "The provider or institution offering the course.", 151 | "type": "string" 152 | }, 153 | { 154 | "name": "Course_Level", 155 | "description": "The proficiency level, ranging from Beginner to Advanced.", 156 | "type": "string" 157 | }, 158 | { 159 | "name": "Duration", 160 | "description": "The estimated time required to complete the module or course.", 161 | "type": "string" 162 | }, 163 | { 164 | "name": 
"Prerequisites", 165 | "description": "One or more courses that need to be completed before a learner can enroll in or take the current course.", 166 | "type": "string" 167 | }, 168 | { 169 | "name": "Prework", 170 | "description": "Foundational knowledge or skills required for successful engagement with the course material.", 171 | "type": "string" 172 | }, 173 | { 174 | "name": "Course_Learning_Material", 175 | "description": "Course Title/Name of the Data Science, Machine Learning, or AI course or learning material.", 176 | "type": "string" 177 | }, 178 | { 179 | "name": "Course_Learning_Material_Link", 180 | "description": "Hyperlinks to access the course or learning material directly.", 181 | "type": "string" 182 | }, 183 | { 184 | "name": "Type_Free_Paid", 185 | "description": "Indicates whether the course is available for free or requires payment.", 186 | "type": "string" 187 | } 188 | ] 189 | } 190 | } 191 | ] 192 | }, 193 | "errorMessage": "", 194 | "hasErrorMessage": false 195 | } -------------------------------------------------------------------------------- /data/Courses_and_Learning_Material.csv: -------------------------------------------------------------------------------- 1 | Module_Code,Source,Course_Level,Duration,Prerequisites,Prework,Course_Learning_Material,Course_Learning_Material_Link,Type_Free_Paid 2 | CLMML00,Google Developers,Beginners,70 minutes,No,No,Introduction to Machine Learning,https://developers.google.com/machine-learning/intro-to-ml,Free 3 | CLMML01,Google Developers,Beginners to Intermediate,,"Yes, a handful ",Yes,Machine Learning Crash Course (Foundation),https://developers.google.com/machine-learning/crash-course,Free 4 | CLMML02,Google Developers,Beginners to Intermediate,45 minutes,No,No,Problem Framing (ML related),https://developers.google.com/machine-learning/problem-framing,Free 5 | CLMML03,Google Developers,Beginners to Intermediate,,No,No,Data Preparation and Feature Engineering in 
ML,https://developers.google.com/machine-learning/data-prep,Free 6 | CLMML04,Google Developers,Beginners to Intermediate,,Yes,No,Testing and Debugging,https://developers.google.com/machine-learning/testing-debugging,Free 7 | CLMML05,Google Developers,Intermediate to Advanced,,No,No,Decision Forests,https://developers.google.com/machine-learning/decision-forests,Free 8 | CLMML06,Google Developers,Intermediate to Advanced,,No,No,Recommendation Systems,https://developers.google.com/machine-learning/recommendation,Free 9 | CLMML07,Google Developers,Intermediate to Advanced,,No,No,Clustering,https://developers.google.com/machine-learning/clustering,Free 10 | CLMML08,Google Developers,Intermediate to Advanced,,No,No,Generative Adversarial Networks,https://developers.google.com/machine-learning/gan,Free 11 | CLMML09,Google Developers,Intermediate to Advanced,,No,No,Image Classification,https://developers.google.com/machine-learning/practica/image-classification,Free 12 | CLMML10,Google Developers,Intermediate to Advanced,,No,No,Fairness in Perspective API,https://developers.google.com/machine-learning/practica/fairness-indicators,Free 13 | CLMF001,Fast.ai,Intermediate to Advanced,,Yes,Varies,Fast.ai,https://course.fast.ai/,Free 14 | CLMAIE1,IBM,Beginners to Intermediate,,No,No,AI Ethics Resources,https://www.ibm.com/topics/ai-ethics,Free 15 | CLMML10,Google Developers,Intermediate to Advanced,,Yes,Yes,ML Engineer / Data Scientist Google Cloud Learning Path,https://docs.google.com/presentation/d/18dV09U9JqqB01RbMrfwRUB3LkSAu5L8R/edit#slide=id.p1,Free 16 | CLMML11,Google Cloud Skills Boost,Intermediate to Advanced,,Yes,Yes,Machine Learning Engineer Learning Path,https://www.cloudskillsboost.google/paths/17,Free 17 | CLMML12,Google Developers,Intermediate to Advanced,,Yes,Yes,Machine Learning Advance Courses,https://developers.google.com/machine-learning/advanced-courses,Free 18 | CLMK001,Kaggle Learn,Beginners to 
Intermediate,,No,No,Kaggle:Courses,https://www.kaggle.com/learn,Free 19 | CLMG001,Google Cloud Skills Boost,Intermediate,2 days,No,No,Data Engineer - Preparing for the Google Cloud Professional Data Engineer Exam,https://www.cloudskillsboost.google/course_templates/72?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346376,Free during the mentorship program 20 | CLMG002,Google Cloud Skills Boost,Beginners,1 day,No,No,Data Engineer - Google Cloud Big Data and Machine Learning Fundamentals,https://www.cloudskillsboost.google/course_templates/3?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346323,Free during the mentorship program 21 | CLMG003,Google Cloud Skills Boost,Beginners,2 days,No,No,Data Engineer - Modernizing Data Lakes and Data Warehouses with Google Cloud,https://www.cloudskillsboost.google/course_templates/54?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346330,Free during the mentorship program 22 | CLMG004,Google Cloud Skills Boost,Beginners,3 days,No,No,Data Engineer - Building Batch Data Pipelines on Google Cloud,https://www.cloudskillsboost.google/course_templates/53?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346338,Free during the mentorship program 23 | CLMG005,Google Cloud Skills Boost,Beginners,3 days,No,No,Data Engineer - Building Resilient Streaming Analytics Systems on Google Cloud,https://www.cloudskillsboost.google/course_templates/52?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346347,Free during the mentorship program 24 | CLMG006,Google Cloud Skills Boost,Beginners,5 days,No,No,"Data Engineer - Smart Analytics, Machine Learning, and AI on Google 
Cloud",https://www.cloudskillsboost.google/course_templates/55?catalog_rank=%7B%22rank%22%3A2%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346351,Free during the mentorship program 25 | CLMG007,Google Cloud Skills Boost,Intermediate,,No,No,Data Engineer - Serverless Data Processing with Dataflow: Foundations,https://www.cloudskillsboost.google/course_templates/218?catalog_rank=%7B%22rank%22%3A2%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346355,Free during the mentorship program 26 | CLMG008,Google Cloud Skills Boost,Advanced,,No,No,Data Engineer - Serverless Data Processing with Dataflow: Develop Pipelines,https://www.cloudskillsboost.google/course_templates/229?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346358,Free during the mentorship program 27 | CLMG009,Google Cloud Skills Boost,Advanced,,No,No,Data Engineer - Serverless Data Processing with Dataflow: Operations,https://www.cloudskillsboost.google/course_templates/264?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346360,Free during the mentorship program 28 | CLMG010,Google Cloud Skills Boost,Beginners,45 minutes,No,No,Data Engineer - Lab: A Tour of Google Cloud Hands-on Labs,https://www.cloudskillsboost.google/focuses/2794?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&parent=catalog&search_id=25346362,Free during the mentorship program 29 | CLMG011,Google Cloud Skills Boost,Beginners,1 hour,No,No,Data Engineer - Lab: Engineer Data in Google Cloud,https://www.cloudskillsboost.google/focuses/12379?catalog_rank=%7B%22rank%22%3A4%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&parent=catalog&search_id=25346300,Free during the mentorship program 30 | CLMG012,Google Cloud Skills Boost,Beginners,7 hours,No,No,"Data Engineer - Quest: Perform Foundational Data, ML, and AI Tasks in Google 
Cloud",https://www.cloudskillsboost.google/quests/117?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346364,Free during the mentorship program 31 | CLMG013,Google Cloud Skills Boost,Beginners,1 hour 30 minutes,No,No,Data Engineer - Quest: Build and Optimize Data Warehouses with BigQuery,https://www.cloudskillsboost.google/focuses/14341?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&parent=catalog&search_id=25346367,Free during the mentorship program 32 | CLMG014,Google Cloud Skills Boost,Intermediate,6 hours,No,No,Data Engineer - Quest: Engineer Data in Google Cloud,https://www.cloudskillsboost.google/quests/132?catalog_rank=%7B%22rank%22%3A2%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346300,Free during the mentorship program 33 | CLMG015,Google Cloud Skills Boost,Intermediate,4 hours,No,No,Data Engineer - Quest: Data Engineer,https://www.cloudskillsboost.google/quests/25?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346300,Free during the mentorship program 34 | CLMGA01,Google Cloud Skills Boost,Beginner,4 hours,No,No,Introduction to Generative AI Learning Path,https://www.cloudskillsboost.google/journeys/118,Free 35 | CLMGA02,Google Cloud Skills Boost,Beginner to Intermediate,10 hours,Yes,No,Generative AI for Developers Learning Path,https://www.cloudskillsboost.google/journeys/183,Free 36 | DLA:ai-for-everyone,DeepLearning.AI,Beginner,6 hours,No,Yes,AI for Everyone,https://www.deeplearning.ai//courses/ai-for-everyone,Free 37 | DLA:ai-for-medicine-specialization,DeepLearning.AI,Intermediate,3 months,Yes,Yes,AI for Medicine,https://www.deeplearning.ai//courses/ai-for-medicine-specialization,Paid 38 | DLA:deep-learning-specialization,DeepLearning.AI,Intermediate,6 months,Yes,Yes,Deep Learning Specialization,https://www.deeplearning.ai//courses/deep-learning-specialization,Paid 39 | 
DLA:natural-language-processing-specialization,DeepLearning.AI,Intermediate,4 months,Yes,Yes,Natural Language Processing,https://www.deeplearning.ai//courses/natural-language-processing-specialization,Paid 40 | DLA:tensorflow-developer-professional-certificate,DeepLearning.AI,Intermediate,2 - 4 weeks,No,Yes,TensorFlow Developer Professional Certificate,https://www.deeplearning.ai//courses/tensorflow-developer-professional-certificate,Paid 41 | DLA:tensorflow-data-and-deployment-specialization,DeepLearning.AI,Intermediate,2 - 6 Months,Yes,Yes,TensorFlow: Data and Deployment,https://www.deeplearning.ai//courses/tensorflow-data-and-deployment-specialization,Paid 42 | DLA:generative-adversarial-networks-gans-specialization,DeepLearning.AI,Intermediate,3 months,Yes,Yes,Generative Adversarial Networks (GANs),https://www.deeplearning.ai//courses/generative-adversarial-networks-gans-specialization,Paid 43 | DLA:tensorflow-advanced-techniques-specialization,DeepLearning.AI,Intermediate,5 months,Yes,No,TensorFlow: Advanced Techniques,https://www.deeplearning.ai//courses/tensorflow-advanced-techniques-specialization,Paid 44 | DLA:machine-learning-specialization,"DeepLearning.AI,Stanford Online",Beginner,2.5 Months,Yes,Yes,Machine Learning Specialization,https://www.deeplearning.ai//courses/machine-learning-specialization,Paid 45 | DLA:mathematics-for-machine-learning-and-data-science-specialization,DeepLearning.AI,Beginner,,Yes,Yes,Mathematics for Machine Learning and Data Science,https://www.deeplearning.ai//courses/mathematics-for-machine-learning-and-data-science-specialization,Paid 46 | DLA:ai-for-good,DeepLearning.AI,Beginner,2 - 4 Weeks,Yes,Yes,AI for Good,https://www.deeplearning.ai//courses/ai-for-good,Paid 47 | DLA:chatgpt-prompt-engineering-for-developers,"OpenAI,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,ChatGPT Prompt Engineering for Developers,https://www.deeplearning.ai//short-courses/chatgpt-prompt-engineering-for-developers,Paid 48 | 
DLA:building-systems-with-chatgpt,"OpenAI,DeepLearning.AI",Beginner,1 hour,Yes,Yes,Building Systems with the ChatGPT API,https://www.deeplearning.ai//short-courses/building-systems-with-chatgpt,Paid 49 | DLA:langchain-for-llm-application-development,"LangChain,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,LangChain for LLM Application Development,https://www.deeplearning.ai//short-courses/langchain-for-llm-application-development,Paid 50 | DLA:how-diffusion-models-work,DeepLearning.AI,Intermediate,1 Hour,Yes,Yes,How Diffusion Models Work,https://www.deeplearning.ai//short-courses/how-diffusion-models-work,Paid 51 | DLA:generative-ai-with-llms,"AWS,DeepLearning.AI",Intermediate,,Yes,Yes,Generative AI with LLMs,https://www.deeplearning.ai//courses/generative-ai-with-llms,Paid 52 | DLA:langchain-chat-with-your-data,"LangChain,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,LangChain: Chat with Your Data,https://www.deeplearning.ai//short-courses/langchain-chat-with-your-data,Paid 53 | DLA:building-generative-ai-applications-with-gradio,"DeepLearning.AI,Hugging Face",Beginner,1 Hour,Yes,Yes,Building Generative AI Applications with Gradio,https://www.deeplearning.ai//short-courses/building-generative-ai-applications-with-gradio,Paid 54 | DLA:evaluating-debugging-generative-ai,"Weights & Biases,DeepLearning.AI",Intermediate,1 Hour,Yes,Yes,Evaluating and Debugging Generative AI Models Using Weights and Biases,https://www.deeplearning.ai//short-courses/evaluating-debugging-generative-ai,Paid 55 | DLA:large-language-models-semantic-search,"Cohere,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,Large Language Models with Semantic Search,https://www.deeplearning.ai//short-courses/large-language-models-semantic-search,Paid 56 | DLA:finetuning-large-language-models,"Lamini,DeepLearning.AI",Intermediate,1 Hour,Yes,Yes,Finetuning Large Language Models,https://www.deeplearning.ai//short-courses/finetuning-large-language-models,Paid 57 | 
DLA:microsoft-semantic-kernel,"Microsoft,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,How Business Thinkers Can Start Building AI Plugins With Semantic Kernel,https://www.deeplearning.ai//short-courses/microsoft-semantic-kernel,Paid 58 | DLA:google-cloud-vertex-ai,"DeepLearning.AI,Google Cloud",Beginner,1 Hour,Yes,Yes,Understanding and Applying Text Embeddings,https://www.deeplearning.ai//short-courses/google-cloud-vertex-ai,Paid 59 | DLA:pair-programming-llm,"Google,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,Pair Programming with a Large Language Model,https://www.deeplearning.ai//short-courses/pair-programming-llm,Paid 60 | DLA:generative-ai-for-everyone,DeepLearning.AI,Beginner,1 Hour,Yes,Yes,Generative AI for Everyone,https://www.deeplearning.ai//courses/generative-ai-for-everyone,Paid 61 | DLA:functions-tools-agents-langchain,"LangChain,DeepLearning.AI",Intermediate,1 Hour,Yes,Yes,"Functions, Tools and Agents with LangChain",https://www.deeplearning.ai//short-courses/functions-tools-agents-langchain,Paid 62 | DLA:vector-databases-embeddings-applications,"Weaviate,DeepLearning.AI",Intermediate,,Yes,Yes,Vector Databases: from Embeddings to Applications,https://www.deeplearning.ai//short-courses/vector-databases-embeddings-applications,Paid 63 | DLA:quality-safety-llm-applications,"WhyLabs,DeepLearning.AI",Beginner,,Yes,Yes,Quality and Safety for LLM Applications,https://www.deeplearning.ai//short-courses/quality-safety-llm-applications,Paid 64 | DLA:building-evaluating-advanced-rag,"LlamaIndex,DeepLearning.AI,TruEra",Beginner,,Yes,Yes,Building and Evaluating Advanced RAG Applications,https://www.deeplearning.ai//short-courses/building-evaluating-advanced-rag,Paid 65 | DLA:reinforcement-learning-from-human-feedback,"DeepLearning.AI,Google Cloud",Intermediate,,Yes,Yes,Reinforcement Learning from Human Feedback,https://www.deeplearning.ai//short-courses/reinforcement-learning-from-human-feedback,Paid 66 | 
DLA:advanced-retrieval-for-ai,"DeepLearning.AI,Chroma",Intermediate,,Yes,Yes,Advanced Retrieval for AI with Chroma,https://www.deeplearning.ai//short-courses/advanced-retrieval-for-ai,Paid 67 | DLA:build-llm-apps-with-langchain-js,"LangChain,DeepLearning.AI",Intermediate,,Yes,Yes,Build LLM Apps with LangChain.js,https://www.deeplearning.ai//short-courses/build-llm-apps-with-langchain-js,Paid 68 | DLA:llmops,"DeepLearning.AI,Google Cloud",Beginner,,Yes,Yes,LLMOps,https://www.deeplearning.ai//short-courses/llmops,Paid 69 | DLA:automated-testing-llmops,"DeepLearning.AI,CircleCI",Intermediate,,Yes,Yes,Automated Testing for LLMOps,https://www.deeplearning.ai//short-courses/automated-testing-llmops,Paid 70 | DLA:building-applications-vector-databases,"Pinecone,DeepLearning.AI",Beginner,,Yes,Yes,Building Applications with Vector Databases,https://www.deeplearning.ai//short-courses/building-applications-vector-databases,Paid 71 | DLA:serverless-llm-apps-amazon-bedrock,"AWS,DeepLearning.AI",Intermediate,,Yes,Yes,Serverless LLM apps with Amazon Bedrock,https://www.deeplearning.ai//short-courses/serverless-llm-apps-amazon-bedrock,Paid 72 | DLA:prompt-engineering-with-llama-2,"Meta,DeepLearning.AI",Beginner,,Yes,Yes,Prompt Engineering with Llama 2 & 3,https://www.deeplearning.ai//short-courses/prompt-engineering-with-llama-2,Paid 73 | DLA:open-source-models-hugging-face,"DeepLearning.AI,Hugging Face",Beginner,,Yes,Yes,Open Source Models with Hugging Face,https://www.deeplearning.ai//short-courses/open-source-models-hugging-face,Paid 74 | DLA:knowledge-graphs-rag,"DeepLearning.AI,Neo4j",Intermediate,,Yes,Yes,Knowledge Graphs for RAG,https://www.deeplearning.ai//short-courses/knowledge-graphs-rag,Paid 75 | DLA:efficiently-serving-llms,"DeepLearning.AI,Predibase",Intermediate,,Yes,Yes,Efficiently Serving LLMs,https://www.deeplearning.ai//short-courses/efficiently-serving-llms,Paid 76 | 
DLA:javascript-rag-web-apps-with-llamaindex,"LlamaIndex,DeepLearning.AI",Beginner,,Yes,Yes,JavaScript RAG Web Apps with LlamaIndex,https://www.deeplearning.ai//short-courses/javascript-rag-web-apps-with-llamaindex,Paid 77 | DLA:red-teaming-llm-applications,"DeepLearning.AI,Giskard",Beginner,,Yes,Yes,Red Teaming LLM Applications,https://www.deeplearning.ai//short-courses/red-teaming-llm-applications,Paid 78 | DLA:preprocessing-unstructured-data-for-llm-applications,"Unstructured,DeepLearning.AI",Beginner,,Yes,Yes,Preprocessing Unstructured Data for LLM Applications,https://www.deeplearning.ai//short-courses/preprocessing-unstructured-data-for-llm-applications,Paid 79 | DLA:quantization-fundamentals-with-hugging-face,"DeepLearning.AI,Hugging Face",Beginner,,Yes,Yes,Quantization Fundamentals with Hugging Face,https://www.deeplearning.ai//short-courses/quantization-fundamentals-with-hugging-face,Paid 80 | DLA:getting-started-with-mistral,"DeepLearning.AI,Mistral AI",Beginner,,Yes,Yes,Getting Started With Mistral,https://www.deeplearning.ai//short-courses/getting-started-with-mistral,Paid 81 | DLA:prompt-engineering-for-vision-models,"Comet,DeepLearning.AI",Beginner,,Yes,Yes,Prompt Engineering for Vision Models,https://www.deeplearning.ai//short-courses/prompt-engineering-for-vision-models,Paid 82 | DLA:quantization-in-depth,"DeepLearning.AI,Hugging Face",Intermediate,,Yes,Yes,Quantization in Depth,https://www.deeplearning.ai//short-courses/quantization-in-depth,Paid 83 | DLA:machine-learning-in-production,DeepLearning.AI,Intermediate,4 months,Yes,Yes,Machine Learning in Production,https://www.deeplearning.ai//courses/machine-learning-in-production,Paid 84 | DLA:building-agentic-rag-with-llamaindex,"LlamaIndex,DeepLearning.AI",Beginner,,Yes,Yes,Building Agentic RAG with LlamaIndex,https://www.deeplearning.ai//short-courses/building-agentic-rag-with-llamaindex,Paid 85 | DLA:building-multimodal-search-and-rag,"Weaviate,DeepLearning.AI",Intermediate,,Yes,Yes,Building 
Multimodal Search and RAG,https://www.deeplearning.ai//short-courses/building-multimodal-search-and-rag,Paid 86 | DLA:multi-ai-agent-systems-with-crewai,"DeepLearning.AI,crewAI",Beginner,,Yes,Yes,Multi AI Agent Systems with crewAI,https://www.deeplearning.ai//short-courses/multi-ai-agent-systems-with-crewai,Paid 87 | DLA:introduction-to-on-device-ai,"DeepLearning.AI,Qualcomm",Beginner,,Yes,Yes,Introduction to On-Device AI,https://www.deeplearning.ai//short-courses/introduction-to-on-device-ai,Paid 88 | DLA:ai-agentic-design-patterns-with-autogen,"Microsoft,DeepLearning.AI,Penn State University",Beginner,,Yes,Yes,AI Agentic Design Patterns with AutoGen,https://www.deeplearning.ai//short-courses/ai-agentic-design-patterns-with-autogen,Paid 89 | DLA:ai-agents-in-langgraph,"LangChain,Tavily,DeepLearning.AI",Intermediate,,Yes,Yes,AI Agents in LangGraph,https://www.deeplearning.ai//short-courses/ai-agents-in-langgraph,Paid 90 | DLA:building-your-own-database-agent,"Microsoft,DeepLearning.AI",Beginner,,Yes,No,Building Your Own Database Agent,https://www.deeplearning.ai//short-courses/building-your-own-database-agent,Paid 91 | DLA:function-calling-and-data-extraction-with-llms,"DeepLearning.AI,Nexusflow",Beginner,,Yes,Yes,Function-Calling and Data Extraction with LLMs,https://www.deeplearning.ai//short-courses/function-calling-and-data-extraction-with-llms,Paid 92 | DLA:carbon-aware-computing-for-genai-developers,"DeepLearning.AI,Google Cloud",Beginner,,Yes,Yes,Carbon Aware Computing for GenAI Developers,https://www.deeplearning.ai//short-courses/carbon-aware-computing-for-genai-developers,Paid 93 | DLA:prompt-compression-and-query-optimization,"DeepLearning.AI,MongoDB",Intermediate,,Yes,No,Prompt Compression and Query Optimization,https://www.deeplearning.ai//short-courses/prompt-compression-and-query-optimization,Paid 94 | DLA:generative-ai-for-software-development,DeepLearning.AI,Intermediate,15 hours,Yes,No,Generative AI for Software 
Development,https://www.deeplearning.ai//courses/generative-ai-for-software-development,Paid 95 | DLA:pretraining-llms,"DeepLearning.AI,Upstage",Intermediate,,Yes,No,Pretraining LLMs,https://www.deeplearning.ai//short-courses/pretraining-llms,Paid 96 | DLA:intro-to-federated-learning,"DeepLearning.AI,Flower Labs",Beginner,,Yes,Yes,Federated Learning,https://www.deeplearning.ai//short-courses/intro-to-federated-learning,Paid 97 | DLA:embedding-models-from-architecture-to-implementation,"DeepLearning.AI,Vectara",Beginner,,Yes,Yes,Embedding Models: From Architecture to Implementation,https://www.deeplearning.ai//short-courses/embedding-models-from-architecture-to-implementation,Paid 98 | DLA:data-engineering,DeepLearning.AI,Intermediate,15 weeks,Yes,Yes,Data Engineering,https://www.deeplearning.ai//courses/data-engineering,Paid 99 | DLA:ai-python-for-beginners,DeepLearning.AI,Beginner,,Yes,Yes,AI Python for Beginners,https://www.deeplearning.ai//short-courses/ai-python-for-beginners,Paid 100 | --------------------------------------------------------------------------------