├── app ├── .gitkeep ├── llm-gemma-variant │ ├── readme.md │ ├── src │ │ ├── __init__.py │ │ ├── test.py │ │ ├── model.py │ │ ├── backend.py │ │ └── vector_db.py │ ├── tests │ │ └── __init__.py │ ├── makefile │ └── pyproject.toml ├── course-scraper │ ├── src │ │ ├── scrapers │ │ │ ├── kaggle_learn │ │ │ │ ├── __init__.py │ │ │ │ ├── pages.py │ │ │ │ ├── README.md │ │ │ │ ├── scrape_course.py │ │ │ │ ├── scrape_all_courses.py │ │ │ │ └── models.py │ │ │ └── google_cloud_skill_boost │ │ │ │ ├── __init__.py │ │ │ │ ├── scrape_focus.py │ │ │ │ ├── models.py │ │ │ │ ├── pages.py │ │ │ │ ├── README.md │ │ │ │ ├── scrape_journey.py │ │ │ │ └── scrape_course_template.py │ │ ├── config.py │ │ ├── gsheet.py │ │ └── utils.py │ ├── requirements.txt │ └── README.md ├── llm-poc-variant-01 │ ├── deploy │ │ ├── aws │ │ │ ├── .gitignore │ │ │ ├── keypair.tf │ │ │ ├── provider.tf │ │ │ ├── security_groups.tf │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ └── README.md │ │ └── gcp │ │ │ ├── .gitignore │ │ │ ├── project.tfvars │ │ │ ├── chainlit-app-demo.gif │ │ │ ├── ollama.service │ │ │ ├── provider.tf │ │ │ ├── outputs.tf │ │ │ ├── main.tf │ │ │ └── README.md │ ├── .gitignore │ ├── docker │ │ ├── Dockerfile │ │ ├── run-docker-container.sh │ │ └── build-docker-image.sh │ ├── constants.py │ ├── chainlit_app.py │ ├── requirements.txt │ ├── lpiGPT.py │ ├── ingest.py │ └── README.md ├── Jorge_Rocha_campos_ML - Google Docs.pdf └── llm-poc-variant-02 │ ├── .env_template │ ├── requirements.txt │ ├── interface.py │ ├── faiss_index.py │ ├── README.md │ ├── main.py │ └── learning_path_index_contextual_search.ipynb ├── chainlit.md ├── data ├── .gitkeep ├── utils │ ├── requirements.txt │ ├── get-kaggle-dataset-meta-data.py │ └── README.md ├── dataset-metadata.json └── Courses_and_Learning_Material.csv ├── docs └── .gitkeep ├── .github ├── CONTRIBUTING.md ├── workflows │ └── .gitkeep └── CODEOWNERS ├── requirements ├── llm-poc-variant-01.txt ├── base.txt ├── llm-poc-variant-02.txt └── scraper.txt 
├── pyproject.toml ├── .pre-commit-config.yaml ├── LICENSE ├── getting-started.md ├── .gitignore └── README.md /app/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chainlit.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/workflows/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @neomatrix369 -------------------------------------------------------------------------------- /app/llm-gemma-variant/readme.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/llm-gemma-variant/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/llm-gemma-variant/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /data/utils/requirements.txt: -------------------------------------------------------------------------------- 1 | kaggle==1.5.16 -------------------------------------------------------------------------------- /app/course-scraper/src/scrapers/kaggle_learn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/course-scraper/src/scrapers/google_cloud_skill_boost/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/course-scraper/src/scrapers/google_cloud_skill_boost/scrape_focus.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform* 2 | terraform.* -------------------------------------------------------------------------------- /app/llm-gemma-variant/makefile: -------------------------------------------------------------------------------- 1 | run-gemma: 2 | poetry run python src/backend.py -------------------------------------------------------------------------------- /app/llm-poc-variant-01/.gitignore: -------------------------------------------------------------------------------- 1 | source_documents 2 | vector_db 3 | .python-version -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform* 2 | terraform.* 3 | .service_account_credentials.json 4 | *.plan 5 | 
-------------------------------------------------------------------------------- /app/course-scraper/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neomatrix369/learning-path-index/HEAD/app/course-scraper/requirements.txt -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/project.tfvars: -------------------------------------------------------------------------------- 1 | project_id = "kagglex-llm-demo" 2 | region = "europe-west1" 3 | zone = "europe-west1-b" 4 | -------------------------------------------------------------------------------- /requirements/llm-poc-variant-01.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.261 2 | chromadb==0.3.26 3 | joblib 4 | tqdm==4.65.0 5 | sentence_transformers==2.2.2 6 | -------------------------------------------------------------------------------- /app/Jorge_Rocha_campos_ML - Google Docs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neomatrix369/learning-path-index/HEAD/app/Jorge_Rocha_campos_ML - Google Docs.pdf -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/chainlit-app-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neomatrix369/learning-path-index/HEAD/app/llm-poc-variant-01/deploy/gcp/chainlit-app-demo.gif -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/keypair.tf: -------------------------------------------------------------------------------- 1 | resource "aws_key_pair" "lpi-key" { 2 | key_name = "lpi-key" 3 | public_key = file("~/.ssh/lpi-key.pub") 4 | tags = { 5 | Name = "lpi-key" 6 | } 7 | } 
-------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | cfgv==3.4.0 2 | distlib==0.3.8 3 | filelock==3.16.1 4 | identify==2.6.1 5 | nodeenv==1.9.1 6 | platformdirs==4.3.6 7 | pre_commit==4.0.0 8 | PyYAML==6.0.2 9 | virtualenv==20.26.6 10 | -------------------------------------------------------------------------------- /app/llm-poc-variant-02/.env_template: -------------------------------------------------------------------------------- 1 | # This file won't become part of the git history as long as it exists in 2 | # the .gitignore file, and it should stay like that 3 | OPENAI_API_KEY= 4 | PINECONE_API_KEY= -------------------------------------------------------------------------------- /app/llm-poc-variant-02/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.216 2 | streamlit==1.27.2 3 | tqdm==4.65.0 4 | # Pre-requisites: [sudo] apt install libopenblas-base libomp-dev 5 | # See https://github.com/onfido/faiss_prebuilt 6 | faiss-cpu==1.7.4 7 | faiss-gpu==1.7.2 -------------------------------------------------------------------------------- /requirements/llm-poc-variant-02.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.216 2 | streamlit==1.27.2 3 | tqdm==4.65.0 4 | # Pre-requisites: [sudo] apt install libopenblas-base libomp-dev 5 | # See https://github.com/onfido/faiss_prebuilt 6 | faiss-cpu==1.7.4 7 | faiss-gpu==1.7.2 8 | -------------------------------------------------------------------------------- /requirements/scraper.txt: -------------------------------------------------------------------------------- 1 | annotated-types==0.5.0 2 | certifi==2023.7.22 3 | charset-normalizer==3.2.0 4 | idna==3.4 5 | lxml==4.9.3 6 | pydantic==1.9.2 7 | pydantic_core==2.10.1 8 | requests==2.31.0 9 | 
typing_extensions==4.8.0 10 | urllib3==2.0.5 11 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform="linux/amd64" python:3.10-bookworm 2 | 3 | COPY . . 4 | 5 | ARG REQUESTS_CA_BUNDLE 6 | ENV REQUESTS_CA_BUNDLE="${REQUESTS_CA_BUNDLE:-}" 7 | 8 | RUN pip install -r requirements.txt 9 | 10 | ENTRYPOINT ["/bin/bash"] -------------------------------------------------------------------------------- /app/course-scraper/src/config.py: -------------------------------------------------------------------------------- 1 | class CONFIG: 2 | # XXX: Modify as needed 3 | DATA_PATH = r'data' 4 | 5 | GCSB_JOURNEY_URL = 'https://www.cloudskillsboost.google/journeys/17' 6 | 7 | # CHROME_USER_DATA_DIR = r"C:\Users\user\AppData\Local\Google\Chrome\User Data" 8 | # CHROME_USER = "Default" 9 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/ollama.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Ollama Service 3 | After=network-online.target 4 | 5 | [Service] 6 | ExecStart=/usr/bin/ollama serve 7 | User=ollama 8 | Group=ollama 9 | Restart=always 10 | RestartSec=3 11 | Environment="PATH=$PATH" 12 | 13 | [Install] 14 | WantedBy=default.target 15 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | ### AWS: https://registry.terraform.io/providers/hashicorp/aws/latest 4 | aws = { 5 | source = "hashicorp/aws" 6 | version = "~> 5.26.0" 7 | } 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = "eu-central-1" 13 | profile = "default" 14 | } 
-------------------------------------------------------------------------------- /app/llm-poc-variant-01/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from chromadb.config import Settings 4 | 5 | # Define the folder for storing database 6 | PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY', 'vector_db') 7 | 8 | # Define the Chroma settings 9 | CHROMA_SETTINGS = Settings( 10 | chroma_db_impl='duckdb+parquet', 11 | persist_directory=PERSIST_DIRECTORY, 12 | anonymized_telemetry=False, 13 | ) 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 88 3 | select = [ 4 | "C", # mccabe rules 5 | "F", # pyflakes rules 6 | "E", # pycodestyle error rules 7 | "W", # pycodestyle warning rules 8 | "B", # flake8-bugbear rules 9 | "I", # isort rules 10 | ] 11 | ignore = [ 12 | "C901", # max-complexity-10 13 | "E501", # line-too-long 14 | ] 15 | 16 | [tool.ruff.format] 17 | indent-style = "space" 18 | quote-style = "single" 19 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-toml 6 | - id: check-yaml 7 | - id: end-of-file-fixer 8 | - id: trailing-whitespace 9 | 10 | - repo: https://github.com/astral-sh/ruff-pre-commit 11 | rev: v0.1.5 12 | hooks: 13 | - id: ruff 14 | args: [--fix, --exit-non-zero-on-fix, --show-fixes] 15 | - id: ruff-format 16 | -------------------------------------------------------------------------------- /app/course-scraper/src/gsheet.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = 
logging.getLogger(__name__) 4 | 5 | 6 | def connect_to_gsheet(sheet_id: str): 7 | # Connect to a Google Sheet 8 | ... 9 | 10 | 11 | class GSheetWriter: 12 | """ 13 | TODO: Emulate the behaviour of csv.writer and csv.DictWriter, 14 | but instead write to a Google Sheet 15 | """ 16 | 17 | ... 18 | 19 | 20 | class GSheetReader: 21 | """ 22 | TODO: Emulate the behaviour of csv.reader and csv.DictReader, 23 | but instead write to a Google Sheet 24 | """ 25 | 26 | ... 27 | 28 | 29 | # Write a new row 30 | -------------------------------------------------------------------------------- /app/llm-gemma-variant/src/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | from llama_index.core import Settings 4 | from loguru import logger 5 | from llama_index.llms.ollama import Ollama 6 | from model import Gemma 7 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 8 | from vector_db import VectorDB 9 | from pathlib import Path 10 | 11 | # Initialize the model 12 | gemma2_2b = Ollama(model="gemma2:2b", request_timeout=60.0) 13 | logger.debug(gemma2_2b.complete("Hello, how are you?")) 14 | llm_model = Gemma(gemma2_2b, 2000) 15 | Settings.llm = llm_model 16 | 17 | print(llm_model.complete(prompt = "Hello, how are you?")) -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/security_groups.tf: -------------------------------------------------------------------------------- 1 | resource "aws_security_group" "lpi-sg" { 2 | name = "lpi-sg" 3 | description = "LPI Security Group" 4 | 5 | ingress { 6 | description = "SSH" 7 | from_port = 22 8 | to_port = 22 9 | protocol = "tcp" 10 | cidr_blocks = ["0.0.0.0/0"] 11 | ipv6_cidr_blocks = ["::/0"] 12 | } 13 | 14 | egress { 15 | from_port = 0 16 | to_port = 0 17 | protocol = "-1" 18 | cidr_blocks = ["0.0.0.0/0"] 19 | ipv6_cidr_blocks = ["::/0"] 20 | } 21 | 22 | tags = { 23 | 
import streamlit as st


def app():
    """Render the search page and hand back whatever query the user typed."""
    # Page header and tagline
    st.title("KaggleX Learning Path Index Search")
    st.write("Embark your Learning Path Journey with right search !!")

    # Single free-text input; Streamlit returns the current widget value
    query = st.text_input("Enter your course query here")

    # Uncomment to echo the captured query back onto the page:
    # st.write(f"The stored variable is: {query}")

    return query


# Run your Streamlit app
# if __name__ == "__main__":
#     var = app()
#     print(var)
terraform { 2 | backend "gcs" { 3 | bucket= "llm-project-sbx-tf-state" 4 | prefix = "static.tfstate.d" 5 | } 6 | 7 | required_providers { 8 | ### GCP: https://registry.terraform.io/providers/hashicorp/google/latest 9 | google = { 10 | source = "hashicorp/google" 11 | version = "~> 4.0" 12 | } 13 | } 14 | } 15 | 16 | variable "project_id" { 17 | type = string 18 | description = "The ID of the GCP project" 19 | } 20 | 21 | variable "region" { 22 | type = string 23 | description = "The region of the GCP project" 24 | } 25 | 26 | variable "zone" { 27 | type = string 28 | description = "The zone of the GCP project" 29 | } 30 | 31 | provider "google" { 32 | project = var.project_id 33 | region = var.region 34 | } 35 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/outputs.tf: -------------------------------------------------------------------------------- 1 | output "self_link" { 2 | description = "The self link of the instance" 3 | value = try(google_compute_instance.lpi-cpu-vm.self_link, "") 4 | } 5 | 6 | output "network_interface_0_access_config_0_nat_ip" { 7 | description = "The external IP address assigned to the instance" 8 | value = try(google_compute_instance.lpi-cpu-vm.network_interface[0].access_config[0].nat_ip, "") 9 | } 10 | 11 | output "network_interface_0_network_ip" { 12 | description = "The internal IP address assigned to the instance" 13 | value = try(google_compute_instance.lpi-cpu-vm.network_interface[0].network_ip, "") 14 | } 15 | 16 | output "tags" { 17 | description = "A map of tags assigned to the resource" 18 | value = try(google_compute_instance.lpi-cpu-vm.tags, {}) 19 | } 20 | -------------------------------------------------------------------------------- /app/llm-gemma-variant/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "llm-gemma-variant" 3 | version = "0.1.0" 4 | description = "" 5 | authors 
import string

# Characters that are safe to keep when sanitising a string for use in
# filenames: ASCII letters, digits, dot and hyphen.
SAFECHARS = string.ascii_lowercase + string.ascii_uppercase + string.digits + ".-"


def get_safestring(string: str):
    """Return *string* with every character not in SAFECHARS stripped out.

    NOTE: the parameter name shadows the stdlib ``string`` module; it is kept
    for backward compatibility with keyword callers (the module-level
    SAFECHARS constant is computed before any call, so behaviour is correct).
    """
    return "".join([c for c in string if c in SAFECHARS])


def find_element_by_xpath(dom, xpath):
    """Return the first element in *dom* matching *xpath*.

    Raises IndexError (as before) when nothing matches, but now with a
    message naming the failing XPath instead of a bare ``list index out of
    range``.
    """
    matches = dom.xpath(xpath)
    if not matches:
        raise IndexError(f"no element matched XPath {xpath!r}")
    return matches[0]


def find_elements_by_xpath(dom, xpath):
    """Return all elements in *dom* matching *xpath* (possibly empty)."""
    return dom.xpath(xpath)


def login_selenium_driver_to_gcb(driver: "WebDriver"):
    """Log a Selenium driver into Google Cloud Skill Boost.

    Uses the credentials and URL from CONFIG; the original referenced CONFIG
    without ever importing it, which raised NameError on every call.
    """
    from config import CONFIG  # local import fixes the missing-name bug
    from scrapers.google_cloud_skill_boost import pages

    driver.get(CONFIG.GCB_LOGIN_URL)
    print(driver.title)
    driver.find_element("xpath", pages.GCSBSignInPage.user_email).send_keys(
        CONFIG.GCB_EMAIL
    )
    driver.find_element("xpath", pages.GCSBSignInPage.user_password).send_keys(
        CONFIG.GCB_PASSWORD
    )
    driver.find_element("xpath", pages.GCSBSignInPage.sign_in_button).click()
# Original code: https://lindevs.com/get-dataset-metadata-from-kaggle-using-api-and-python/
"""Fetch the metadata of the Learning Path Index Kaggle dataset and save it
as pretty-printed JSON into the parent ``data/`` folder."""

import os
import json
from pprint import pprint

from kaggle.api.kaggle_api_extended import KaggleApi

owner = 'neomatrix369'
dataset_name = 'learning-path-index-dataset'

api = KaggleApi()
# Reads credentials from ~/.kaggle/kaggle.json or KAGGLE_* env vars
api.authenticate()

print(f"\nFetching the metadata of {owner}/{dataset_name}")
metadata = api.metadata_get(owner, dataset_name)

print(f"\nPrinting the metadata of {owner}/{dataset_name}")
pprint(metadata)

# Resolve the output path relative to this script (data/utils/..) rather than
# the current working directory, so the script works from any launch location.
metadata_filename = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "..", "dataset-metadata.json"
)

print(f"\nSaving the metadata to {metadata_filename}")
with open(metadata_filename, "w", encoding="utf-8") as metadata_file:
    # indent=2 formats the JSON when saving it
    metadata_file.write(json.dumps(metadata, indent=2))
list[CourseSubmodule] # __root__ == 🌟 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 mani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
"""Demo: build a FAISS index over the Learning Pathway Index CSV, persist it,
reload it, and answer one retrieval-QA query with OpenAI."""

import os

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI


def _resolve_data_path(base_dir: str) -> str:
    """Return the path to the course CSV under *base_dir*.

    The original concatenated hard-coded Windows separators
    (``"\\final_project\\..."``), which breaks on Linux/macOS;
    os.path.join builds the correct path on every platform.
    """
    return os.path.join(base_dir, "final_project", "Learning_Pathway_Index.csv")


def faiss_index():
    """Index the CSV, save/reload the vector store and run a sample query.

    Requires OPENAI_API_KEY in the environment (used by OpenAIEmbeddings
    and the OpenAI LLM).
    """
    data_path = _resolve_data_path(os.getcwd())
    loader = TextLoader(data_path)
    documents = loader.load()
    text_splitter = CharacterTextSplitter(
        chunk_size=1000, chunk_overlap=30, separator="\n"
    )
    docs = text_splitter.split_documents(documents=documents)

    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local("faiss_learning_path_index")

    # Round-trip through disk to demonstrate persistence before querying
    new_vectorstore = FAISS.load_local("faiss_learning_path_index", embeddings)
    qa = RetrievalQA.from_chain_type(
        llm=OpenAI(), chain_type="stuff", retriever=new_vectorstore.as_retriever()
    )
    res = qa.run("Give me Machine Learning Course with 10 or 20 min duration.")
    print(res)


if __name__ == "__main__":
    faiss_index()
${FULL_DOCKER_TAG_NAME} || true 15 | } 16 | 17 | WORKDIR="/home/" 18 | LOCAL_MODEL_FOLDER="$(pwd)/../" 19 | MODEL_VOLUME_SHARED="--volume ${LOCAL_MODEL_FOLDER}:${WORKDIR}" 20 | OLLAMA_VOLUME_SHARED="--volume $(which ollama):/usr/bin/ollama" 21 | HF_CACHE_SHARED="--volume ${LOCAL_MODEL_FOLDER}/.cache:/root/.cache" 22 | 23 | set -x 24 | 25 | # pullImage 26 | time docker run --rm -it --network="host" \ 27 | --platform="linux/amd64" \ 28 | --network="host" \ 29 | --add-host=host.docker.internal:host-gateway \ 30 | --workdir "${WORKDIR}" \ 31 | --env OLLAMA_HOST="http://host.docker.internal:11434" \ 32 | ${HF_CACHE_SHARED} \ 33 | ${MODEL_VOLUME_SHARED} \ 34 | ${OLLAMA_VOLUME_SHARED} \ 35 | "${FULL_DOCKER_TAG_NAME}" 36 | set +x 37 | 38 | echo "* Finished running docker image ${FULL_DOCKER_TAG_NAME}" -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/outputs.tf: -------------------------------------------------------------------------------- 1 | output "arn" { 2 | description = "The ARN of the instance" 3 | value = try(aws_instance.lpi-cpu-vm.arn, "") 4 | } 5 | 6 | output "public_dns" { 7 | description = "This public DNS name assigned to the instance" 8 | value = try(aws_instance.lpi-cpu-vm.public_dns, "") 9 | } 10 | 11 | output "public_ip" { 12 | description = "This public IP address assigned to the instance" 13 | value = try(aws_instance.lpi-cpu-vm.public_ip, "") 14 | } 15 | 16 | output "private_dns" { 17 | description = "This private DNS name assigned to the instance" 18 | value = try(aws_instance.lpi-cpu-vm.private_dns, "") 19 | } 20 | 21 | output "private_ip" { 22 | description = "This private IP address assigned to the instance" 23 | value = try(aws_instance.lpi-cpu-vm.private_ip, "") 24 | } 25 | 26 | 27 | # Outputs the id of the subnet you created in the module 28 | #output "subnet_id" { 29 | # value = try(aws_subnet.this.id, "") 30 | #} 31 | 32 | # Outputs the value of the 33 | # 
/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2 parameter. 34 | #output "ami_id" { 35 | # value = try(data.aws_ssm_parameter.this.value, "") 36 | #} 37 | 38 | output "tags_all" { 39 | description = "A map of tags assigned to the resource" 40 | value = try(aws_instance.lpi-cpu-vm.tags_all, "") 41 | } 42 | -------------------------------------------------------------------------------- /app/course-scraper/src/scrapers/google_cloud_skill_boost/pages.py: -------------------------------------------------------------------------------- 1 | """ 2 | Selectors for different HTML pages 3 | Plural attributes imply a list of elements are returned by the xpath, 4 | rather than a single element 5 | """ 6 | 7 | 8 | # NOTE: GCSB = Google Cloud Skill Boost 9 | class GCSBSignInPage: 10 | user_email = '//input[@id="user_email"]' 11 | user_password = '//input[@id="user_password"]' 12 | sign_in_button = '//button[@data-analytics-action="clicked_sign_in"]' 13 | 14 | 15 | class GCSBLearningJourneyPage: 16 | """ 17 | E.g https://www.cloudskillsboost.google/journeys/183) 18 | """ 19 | 20 | journeys = "//div[@class='activity-card']" 21 | journey_title = ".//h2[2]/text()" 22 | journey_details = ".//div[@class='activity-details']//div[contains(@class, 'ql-subhead-1')]/text()" 23 | journey_description = ".//p/text()" 24 | journey_link = ".//ql-button[contains(text(), 'Learn more')]/@href" 25 | 26 | 27 | class GCSBCourseTemplatePage: 28 | """ 29 | Skill Boost Course page 30 | E.g https://www.cloudskillsboost.google/course_templates/541 31 | """ 32 | 33 | course_title = "//h1[@class='ql-headline-1']" 34 | prework = "(//div[div/text() = 'Prerequisites'])/following-sibling::div/text()" 35 | 36 | 37 | class GCSBFocusPage: 38 | """ 39 | E.g https://www.cloudskillsboost.google/focuses/71938?parent=catalog 40 | """ 41 | 42 | ... 
43 | -------------------------------------------------------------------------------- /app/course-scraper/src/scrapers/kaggle_learn/README.md: -------------------------------------------------------------------------------- 1 | # Kaggle Learn Course Scraper 2 | 3 | This folder contains scripts for scraping courses from the Kaggle Learn website using the Kaggle Learn API. The scripts retrieve course information in JSON format without the need for HTML parsing. 4 | 5 | ## Scripts 6 | 7 | 1. **`scrape_all_courses.py`**: This script makes use of the Kaggle Learn API to scrape all available courses from the platform in a single API request. It's the recommended script to use when scraping a comprehensive list of courses. 8 | 9 | 2. **`scrape_course.py`**: This script is provided for illustrative purposes. It demonstrates how to scrape course information using the Kaggle Learn API on a per-course basis. 10 | 11 | ## Getting Started 12 | 13 | To get started with course scraping, you can choose between the two scripts mentioned above based on your requirements. 14 | 15 | ### Prerequisites 16 | 17 | Make sure you have Python installed on your system. 18 | 19 | ### Usage 20 | 21 | 1. Clone this repository 22 | 23 | 2. Navigate to the repository folder: 24 | 25 | ```bash 26 | cd course-scraper/src 27 | ``` 28 | 29 | 3. 
"""
Scrape Courses from Kaggle Learn
This script makes use of the internal Kaggle Learn API to retrieve course information
Without parsing any HTML
"""

from pathlib import Path

import requests
from config import CONFIG
from scrapers.kaggle_learn.models import KaggleCourse

# Internal (undocumented) Kaggle API endpoint that returns one course
# ("track") as JSON.
KAGGLE_COURSE_API_URL = (
    "https://www.kaggle.com/api/i/education.EducationService/GetTrack"
)

KAGGLE_DATA_PATH = Path(CONFIG.DATA_PATH, "KaggleLearnCourses")
# Bug fix: ensure the output folder exists before writing, matching
# scrape_all_courses.py; without this, open() below fails with
# FileNotFoundError on a fresh checkout.
KAGGLE_DATA_PATH.mkdir(exist_ok=True, parents=True)


def get_course_details(url: str) -> dict:
    """
    Get details of a Kaggle Learn course
    e.g https://www.kaggle.com/learn/feature-engineering

    :param url: public course page URL; the trailing path segment is the
        track slug posted to the API.
    :return: parsed JSON payload describing the course.
    """
    session = requests.Session()
    # Make a preparatory request to get relevant cookies
    session.get(url)
    # Kaggle requires the XSRF cookie to be echoed back as a request header.
    xsrf_token = session.cookies.get("XSRF-TOKEN")
    track_slug = url.split("/")[-1]
    r = session.post(
        KAGGLE_COURSE_API_URL,
        headers={"X-Xsrf-Token": xsrf_token, "Content-Type": "application/json"},
        json={"trackSlug": track_slug},
    )

    return r.json()


course = KaggleCourse.parse_obj(
    get_course_details("https://www.kaggle.com/learn/feature-engineering")
)

with open(
    KAGGLE_DATA_PATH.joinpath("feature-engineering-course.csv"), "w", encoding="utf-8"
) as f:
    course.write_course_summary_to_file(f)
from typing import Any

from llama_index.core.callbacks import CallbackManager
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, PromptTemplate


# This creates a custom llm for more control over our model
class Gemma(CustomLLM):
    """Thin llama-index CustomLLM wrapper around an injected model client."""

    # Maximum number of tokens to generate per completion.
    num_output: int = 8192
    model_name: str = "Gemma"
    # The wrapped client (e.g. an Ollama instance); typed Any so pydantic
    # does not attempt to validate it.
    model: Any = None

    def __init__(self, model: Any, num_output: int) -> None:
        super(Gemma, self).__init__()
        self.model = model
        self.num_output = num_output

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        # NOTE(review): assumes the wrapped client's complete() accepts a
        # max_length keyword — confirm for the client actually injected
        # (backend.py passes an Ollama instance).
        return self.model.complete(prompt, max_length=self.num_output)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        # Accumulate the full text while yielding one response per token;
        # `delta` carries only the newly generated token.
        response = ""
        for token in self.model.generate(prompt, max_length=self.num_output):
            response += token
            yield CompletionResponse(text=response, delta=token)
Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset). 4 | 5 | ## Pre-requisites 6 | 7 | - Python 3.10 or higher 8 | - Docker (to run inside docker containers) 9 | - Shell-scripting (basic skills) 10 | - Kaggle 11 | 12 | **Steps** 13 | 14 | - Setup your `.bashrc` or `.zshrc` or Windows environment with the below environment variables: 15 | 16 | ```bash 17 | export KAGGLE_USERNAME="[your kaggle username]" 18 | export KAGGLE_KEY="[your kaggle API key]" 19 | ``` 20 | 21 | - See [Kaggle API Docs](https://www.kaggle.com/docs/api) the lastest docs, but for specific queries like 22 | [How to Obtain a Kaggle API Key](https://christianjmills.com/posts/kaggle-obtain-api-key-tutorial/#generate-your-kaggle-api-key), [Christian Mill's Kaggle Docs](https://christianjmills.com/posts/kaggle-obtain-api-key-tutorial/) are also useful. 23 | 24 | - Install dependencies by running: 25 | 26 | ```bash 27 | pip install requirements.txt 28 | ``` 29 | 30 | ## Usage 31 | 32 | ```bash 33 | cd [into this folder] 34 | python get-kaggle-dataset-meta-data.py 35 | ``` 36 | 37 | This creates the metadata json file in the parent folder by the name `dataset-metadata.json`. 
import os
from dotenv import load_dotenv
from llama_index.core import Settings
from loguru import logger
from llama_index.llms.ollama import Ollama
from model import Gemma
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from vector_db import VectorDB
from pathlib import Path


def main() -> None:
    """Run a naive-RAG demo query over the Learning Pathway Index CSV.

    Wires together the Gemma LLM (served by Ollama), a HuggingFace
    embedding model and the Weaviate-backed vector index, executes one
    example query and logs the response.
    """
    # Load and verify API key
    load_dotenv('credentials.env')

    # Initialize the model
    gemma2_2b = Ollama(model="gemma2:2b")
    # 2000 caps the output token budget (see model.Gemma.num_output).
    llm_model = Gemma(gemma2_2b, 2000)
    Settings.llm = llm_model

    # Initialize embedding model
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

    # Getting data path
    # NOTE(review): assumes the process is launched from the directory that
    # contains LPI_folder/ — confirm, or consider anchoring on __file__.
    current_directory = os.getcwd()
    data_path = os.path.join(current_directory, r"LPI_folder/Learning_Pathway_Index.csv")

    # Initialize the VectorDB class
    weaviate_vector_db = VectorDB(
        data_path=data_path,
        index_name="Learning_path_index"
    )

    # Create the vector database
    index = weaviate_vector_db.vector_db_creation()

    # Initialize RAG
    naive_rag_query_engine = index.as_query_engine()

    # Run your naive RAG query
    response = naive_rag_query_engine.query("What courses should I take if i want to learn about finetuning?")

    logger.info(response.response)

    # Disconnect from the Weaviate vector database
    weaviate_vector_db.disconnect()

if __name__ == "__main__":
    main()
"""
Scrape Courses from Kaggle Learn
This script makes use of the internal Kaggle Learn API to retrieve course information
Without parsing any HTML
"""

from pathlib import Path

import requests
from config import CONFIG
from pydantic import BaseModel
from scrapers.kaggle_learn.models import (
    KaggleCourse,
)
from utils import get_safestring

# Internal (undocumented) Kaggle API endpoint that returns every course
# ("track") in a single JSON payload.
KAGGLE_COURSE_API_URL = (
    "https://www.kaggle.com/api/i/education.EducationService/GetTracks"
)

KAGGLE_DATA_PATH = Path(CONFIG.DATA_PATH, "KaggleLearnCourses")
KAGGLE_DATA_PATH.mkdir(exist_ok=True, parents=True)


def get_page_details(url: str) -> dict:
    """
    Get all courses and their details from Kaggle Learn Homepage https://www.kaggle.com/learn/

    :param url: any Kaggle page URL; it is fetched only to obtain session
        cookies (including the XSRF token) required by the API call.
    :return: parsed JSON payload with a top-level 'tracks' list.
    """
    session = requests.Session()
    # Make a preparatory request to get relevant cookies
    session.get(url)
    # Kaggle requires the XSRF cookie to be echoed back as a request header.
    xsrf_token = session.cookies.get("XSRF-TOKEN")
    r = session.post(
        KAGGLE_COURSE_API_URL,
        headers={"X-Xsrf-Token": xsrf_token, "Content-Type": "application/json"},
        json={},
    )

    return r.json()


class AllKaggleCourses(BaseModel):
    # Wrapper model matching the API's top-level {'tracks': [...]} shape.
    tracks: list[KaggleCourse]


page = AllKaggleCourses.parse_obj(
    get_page_details("https://www.kaggle.com/learn/feature-engineering")
)

# Write one summary CSV per course, named with a filesystem-safe slug.
for course in page.tracks:
    file_name = get_safestring(course.name)
    with open(
        KAGGLE_DATA_PATH.joinpath(f"{file_name}.csv"), "w", encoding="utf-8"
    ) as f:
        course.write_course_summary_to_file(f)
# EC2 instance that hosts the LPI LLM proof-of-concept (CPU-only).
resource "aws_instance" "lpi-cpu-vm" {
  ami = "ami-065deacbcaac64cf2" //Ubuntu AMI
  ### https://aws.amazon.com/ec2/instance-types/
  ###### t2.xlarge = CPU based
  instance_type = "t2.2xlarge"
  ebs_block_device {
    device_name = "/dev/sda1"
    # Root volume size in GiB.
    volume_size = 20
  }

  tags = {
    Name = "LPI Instance (CPU/vm)"
  }

  # Key pair and security group are declared in sibling .tf files.
  key_name = aws_key_pair.lpi-key.key_name
  security_groups = ["lpi-sg"]

  connection {
    type = "ssh"
    ### Important to set this to the correct user, as for AMI Ubuntu/Linux boxes
    ### the default name is 'ubuntu', and NOT 'ec2-user'
    user = "ubuntu"
    private_key = var.ssh_private_key
    password = ""
    host = self.public_ip
  }

  # Bootstrap: install Docker and Ollama, pull the model, then fetch the
  # Learning Path Index dataset for the app.
  provisioner "remote-exec" {
    inline = [
      "sudo apt-get update -y",
      "sudo apt install -y ca-certificates curl gnupg lsb-release",
      "sudo apt-get update -y",
      "curl -fsSL https://get.docker.com -o get-docker.sh",
      "sudo sh get-docker.sh",
      "sudo groupadd -f docker",
      "sudo usermod -aG docker $USER",
      "docker -v || true",
      "curl https://ollama.ai/install.sh | sh",
      # NOTE(review): 'ollama serve' runs in the foreground and would block
      # this provisioner indefinitely — confirm whether the installer's
      # systemd service already serves and this line can be removed or
      # backgrounded.
      "ollama serve",
      "ollama pull llama2-uncensored",
      "echo; ollama list; echo",
      "git clone https://github.com/neomatrix369/learning-path-index",
      "cd learning-path-index/app/llm-poc-variant-01/",
      "mkdir -p source_documents",
      "curl https://raw.githubusercontent.com/neomatrix369/learning-path-index/main/data/Learning_Pathway_Index.csv -o 'source_documents/Learning Pathway Index.csv'"
    ]
  }
}
**Setup Your Virtual Environment** 4 | - Create a new virtual environment: 5 | 6 | ```bash 7 | python -m venv venv 8 | source venv/bin/activate 9 | 10 | #Windows 11 | venv/Scripts/activate 12 | ``` 13 | 14 | 2. **Install Dependencies** 15 | 16 | - Change to the appropriate directory 17 | 18 | ```bash 19 | cd C:/{path}/learning-path-index/app/course-scraper 20 | ``` 21 | 22 | 23 | - Run the following command to install the required dependencies: 24 | 25 | ```bash 26 | pip install -r requirements.txt 27 | ``` 28 | 29 | 3. **Run the Scripts** 30 | 31 | - **Scrape a Journey** (Run this first for each journey) 32 | - Example: 33 | To scrape the ML Engineer Path (https://www.cloudskillsboost.google/journeys/183) 34 | modify the config variables in `scrape_journey.py` and run 35 | ```bash 36 | python -m scrapers.google_cloud_skill_boost.scrape_journey 37 | ``` 38 | 39 | - **Scrape a Course Template** 40 | - Example: 41 | To scrape the details of all the courses in the ML Engineer Path (Details of Learning Paths are termed course templates e.g https://www.cloudskillsboost.google/course_templates/541), 42 | ```bash 43 | python -m scrapers.scrapers.google_cloud_skill_boost.scrape_course_template 44 | ``` 45 | 46 | - **TODO: Scrape a Lab/Focus** 47 | - Example: 48 | To scrape the details of a lab (An example lab is https://www.cloudskillsboost.google/focuses/71938?parent=catalog) 49 | ```bash 50 | python -m scrapers.scrapers.google_cloud_skill_boost.scrape_focus 51 | ``` 52 | 53 | ## Configuration 54 | You can modify most of the scraping behavior and parameters by editing the `config.py` file. 
55 | -------------------------------------------------------------------------------- /app/llm-poc-variant-02/README.md: -------------------------------------------------------------------------------- 1 | # kagglex-final-project 2 | 3 | A prototype written in Python to illustrate/demonstrate querying the Learning Path Index Dataset (see [Kaggle Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset) and [GitHub repo](https://github.com/neomatrix369/learning-path-index)), with the help of the OpenAI GPT technology (InstructHPT model and embeddings model), [Langchain](https://python.langchain.com/) and using [Facebook's FAISS library](https://faiss.ai/). 4 | 5 | 6 | ![image](https://github.com/mbhoge/kagglex-final-project/assets/988040/5396aee3-cf0f-43b6-9b44-aaf779ed803a) 7 | 8 | 9 | The end-to-end process can be learnt by going through the code base as well as by observing the console logs when using both the Streamlit and the CLI versions. 10 | 11 | ## Pre-requisites 12 | 13 | - Python 3.8.x or above 14 | - OpenAI API Key (see [How to get an OpenAI API Key](https://www.howtogeek.com/885918/how-to-get-an-openai-api-key/) -- note it's may not be FREE anymore) 15 | - Install dependencies from `requirements.txt` 16 | - Basic Command-line experience 17 | - Basic git and GitHub experience 18 | 19 | ## Install and run 20 | 21 | Copy the `.env_template` to `.env` in the current folder and then add your OpenAI API Key to `.env`. 
#!/bin/bash
# Build the python-3.10 docker image for llm-poc-variant-01 and push it to
# Docker Hub under ${DOCKER_USER_NAME}. Requires a local docker daemon and
# (for the push step) interactive Docker Hub credentials.

set -e
set -u
set -o pipefail

DOCKER_USER_NAME="${DOCKER_USER_NAME:-neomatrix369}"
FULL_DOCKER_TAG_NAME="python-3.10-docker-env"

# Remove exited containers and dangling images from the local registry.
# The trailing '|| true' keeps the '[ ... ] && ...' chains from tripping
# 'set -e' when there is nothing to clean up.
cleanup() {
    containersToRemove=$(docker ps --quiet --filter "status=exited")
    [ ! -z "${containersToRemove}" ] && \
        echo "Remove any stopped container from the local registry" && \
        docker rm ${containersToRemove} || true

    imagesToRemove=$(docker images --quiet --filter "dangling=true")
    [ ! -z "${imagesToRemove}" ] && \
        echo "Remove any dangling images from the local registry" && \
        docker rmi -f ${imagesToRemove} || true
}

# Log in to Docker Hub and push the freshly built image.
pushImageToHub() {
    echo "Pushing image ${FULL_DOCKER_TAG_NAME} to Docker Hub"; echo ""

    if ! docker login --username=${DOCKER_USER_NAME}; then
        echo "Failed to login to Docker Hub"
        exit 1
    fi
    pushImage ${FULL_DOCKER_TAG_NAME}
}

# Echo the id of the newest local image matching $1 (empty if none).
findImage() {
    IMAGE_NAME="${1}"
    echo $(docker images ${IMAGE_NAME} -q | head -n1 || true)
}

# Tag the local image as ${DOCKER_USER_NAME}/${IMAGE_NAME} and push it.
pushImage() {
    IMAGE_NAME="${1}"
    FULL_DOCKER_TAG_NAME="${DOCKER_USER_NAME}/${IMAGE_NAME}"

    IMAGE_FOUND="$(findImage ${IMAGE_NAME})"
    IS_FOUND="found"
    if [[ -z "${IMAGE_FOUND}" ]]; then
        IS_FOUND="not found"
    fi
    echo "Docker image '${DOCKER_USER_NAME}/${IMAGE_NAME}' is ${IS_FOUND} in the local repository"

    # NOTE(review): if IMAGE_FOUND is empty, 'docker tag' is invoked with a
    # missing argument and the script aborts under 'set -e' — confirm
    # whether an explicit "not found" early exit is intended here.
    docker tag ${IMAGE_FOUND} ${FULL_DOCKER_TAG_NAME}
    docker push ${FULL_DOCKER_TAG_NAME}
}


echo "Building image ${FULL_DOCKER_TAG_NAME}"; echo ""

WORKDIR="/home/"
cleanup

# The Dockerfile expects requirements.txt in the build context; copy it in
# for the build and remove it afterwards.
cp ../requirements.txt .
set -x
# Optional corporate CA bundle (first *.pem in this folder, if any).
REQUESTS_CA_BUNDLE="$(ls *.pem || true)"
time docker build \
    --build-arg WORKDIR=${WORKDIR} \
    --build-arg REQUESTS_CA_BUNDLE=${REQUESTS_CA_BUNDLE} \
    -t ${FULL_DOCKER_TAG_NAME} \
    .
set +x
rm -f requirements.txt

echo "* Finished building docker image ${FULL_DOCKER_TAG_NAME}"

pushImageToHub

cleanup
curl https://raw.githubusercontent.com/neomatrix369/learning-path-index/main/data/Learning_Pathway_Index.csv -o 'source_documents/Learning Pathway Index.csv' 58 | 59 | python3 -m venv venv 60 | . venv/bin/activate 61 | python3 -m pip install -r requirements.txt 62 | 63 | chainlit run chainlit_app.py --host 0.0.0.0 --port 8000 64 | EOF 65 | 66 | tags = [ 67 | "lpi-sg", 68 | "http-server", 69 | "https-server" 70 | ] 71 | } 72 | -------------------------------------------------------------------------------- /app/course-scraper/README.md: -------------------------------------------------------------------------------- 1 | # Course Scraper Module 2 | 3 | The Course Scraper Module is a versatile tool designed to fetch course information and duration from various online learning platforms. It simplifies the process of finding the right resource for your learning needs by providing a unified interface for accessing course details from supported platforms. 4 | 5 | ## Supported Platforms 6 | 7 | - [x] Google Developer Courses 8 | - [x] Fast.ai ML Course 9 | - [x] IBM - AI & Ethics Course 10 | - [ ] Google Cloud Skill Boost: Machine Learning Engineer 11 | - [x] Google Cloud Skill Boost: Data Learning Engineer 12 | - [ ] Google Cloud Skill Boost: Data Analyst 13 | - [x] Google Cloud Skill Boost: Generative AI 14 | - [ ] Google Cloud Skill Boost: AD-HOC Courses 15 | - [ ] [Kaggle Learn Courses](./src/scrapers/kaggle_learn) 16 | - [ ] Deeplearning.ai Courses 17 | 18 | ## Getting Started 19 | 20 | To get started with the Course Scraper Module, follow these steps: 21 | 22 | 1. **Clone the Repository:** 23 | 24 | Clone this GitHub repository to your local machine: 25 | 26 | 27 | 2. **Navigate to the Course Scraper Module:** 28 | 29 | Change your current working directory to the course-scraper subfolder within the cloned repository: 30 | 31 | ```bash 32 | cd learning-path-index/course-scraper 33 | ``` 34 | 35 | 3. 
**Install Dependencies:** 36 | 37 | Ensure you have all the required dependencies installed. You can do this using pip: 38 | 39 | ```bash 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | 4. **Run the Scraper:** 44 | 45 | Scrapers specific to each platform can be found in `course-scraper/src/scrapers` folder. 46 | Would you like to scrape courses from *Kaggle Learn*? 47 | Checkout the [Kaggle Learn scraper README.md](). 48 | How about *Google Cloud Skill Boost*? 49 | Checkout the [GCSB scraper README.md](). 50 | 51 | 52 | Generally scrapers can be run by navigating to the `course-scraper/src` folder, and running 53 | ```bash 54 | python -m scrapers.. 55 | ``` 56 | 57 | e.g 58 | 59 | ```bash 60 | python -m scrapers.kaggle_learn.scrape_all_courses 61 | ``` 62 | 63 | 64 | 5. **View the Results:** 65 | 66 | The scraper will provide the course details and duration in a structured format. In the folder determined by `config.py` 67 | 68 | ## Usage 69 | 70 | ### Scraper Configuration 71 | 72 | You can configure the general behaviour of all scrapers by modifying the `config.py` file. This file allows you to specify: 73 | - output location ✅ 74 | - the output format (TODO: 🚧), 75 | 76 | and other settings. 77 | 78 | ## Contributing 79 | 80 | We welcome contributions to enhance and expand the Course Scraper Module. If you'd like to contribute, please follow these guidelines: 81 | 82 | 1. Fork the repository. 83 | 2. Create a new branch for your feature or bug fix. 84 | 3. Make your changes and ensure that the code passes all tests. 85 | 4. Submit a pull request with a clear description of your changes and their purpose. 86 | 87 | 88 | Happy learning and scraping! 
import argparse
import csv
import os
from csv import DictWriter
from pathlib import Path
from urllib.parse import urljoin

import requests
from config import CONFIG
from lxml import etree
from scrapers.google_cloud_skill_boost import pages

COURSE_CODE = 'CLMML11'
GCSB_HOME_URL = 'https://www.cloudskillsboost.google/'
GCSB_LOGIN_URL = 'https://www.cloudskillsboost.google/users/sign_in'

DATA_FOLDER = Path(CONFIG.DATA_PATH, COURSE_CODE)
DATA_FOLDER.mkdir(exist_ok=True, parents=True)


# Open Journey Path
def extract_ml_learning_path(GCSB_JOURNEY_URL) -> list[dict]:
    """Scrape a GCSB learning-journey page into a list of course dicts.

    :param GCSB_JOURNEY_URL: journey URL,
        e.g. https://www.cloudskillsboost.google/journeys/183
    :return: one dict per activity card, with keys
        'title', 'details', 'description' and 'link'.
    """
    r = requests.get(GCSB_JOURNEY_URL)
    dom = etree.fromstring(r.content, etree.HTMLParser())

    data = []
    for journey in dom.xpath(pages.GCSBLearningJourneyPage.journeys):
        details = journey.xpath(pages.GCSBLearningJourneyPage.journey_details)
        details = details[0] if details else 'No details available'

        # Card links are relative; resolve them against the site root.
        link = journey.xpath(pages.GCSBLearningJourneyPage.journey_link)
        link = urljoin(GCSB_HOME_URL, link[0]) if link else 'No link available'

        data.append(
            {
                'title': journey.xpath(pages.GCSBLearningJourneyPage.journey_title)[
                    0
                ].strip()
                if journey.xpath(pages.GCSBLearningJourneyPage.journey_title)
                else 'No title available',
                'details': details.strip(),
                'description': journey.xpath(
                    pages.GCSBLearningJourneyPage.journey_description
                )[0].strip()
                if journey.xpath(pages.GCSBLearningJourneyPage.journey_description)
                else 'No description available',
                'link': link,
            }
        )

    return data


parser = argparse.ArgumentParser(description='Extract ML learning path')
parser.add_argument('--url', help='GCSB Journey URL')
args = parser.parse_args()

# Resolution order for the journey URL: CLI flag, env var, config, prompt.
GCSB_JOURNEY_URL = (
    args.url
    or os.getenv('GCSB_JOURNEY_URL')
    or CONFIG.GCSB_JOURNEY_URL
    or input('Please enter the GCSB Journey URL: ')
)

data = extract_ml_learning_path(GCSB_JOURNEY_URL)

if data:
    try:
        with open(
            DATA_FOLDER.joinpath(f'{COURSE_CODE}-Courses.csv'),
            'w',
            encoding='utf-8',
            newline='',
        ) as f:
            writer = DictWriter(
                f, fieldnames=['title', 'details', 'description', 'link']
            )
            # Bug fix: emit the header row. scrape_course_template.py reads
            # this file back with csv.DictReader and indexes rows by 'title'
            # and 'link', which only works when a header line is present.
            writer.writeheader()
            writer.writerows(data)
        print(f'Data successfully written to {COURSE_CODE}-Courses.csv')
    except IOError as e:
        print(f'An I/O error occurred while writing the file: {e}')
    except csv.Error as e:
        print(f'A CSV-related error occurred: {e}')
    except Exception as e:
        print(f'An unexpected error occurred while writing the file: {e}')
else:
    print('No data to write!')
def format_docs(docs):
    """Join retrieved documents' text into one prompt-context string.

    Each document's ``page_content`` is kept verbatim; documents are
    separated by a blank line so the LLM sees distinct passages.
    """
    contents = (doc.page_content for doc in docs)
    return '\n\n'.join(contents)
import csv
import io
from collections import defaultdict
from html import unescape
from pathlib import Path

import requests
from config import CONFIG
from lxml import etree
from scrapers.google_cloud_skill_boost.models import Activity, Course, CourseSubmodule
from utils import get_safestring

# Learning-path code whose course list was produced by scrape_journey.py;
# also names the output folder.
COURSE_CODE = "CLMML11"

DATA_FOLDER = Path(CONFIG.DATA_PATH, COURSE_CODE)
DATA_FOLDER.mkdir(exist_ok=True, parents=True)

# Maps course title -> raw (HTML-unescaped) JSON string describing its
# modules, as embedded in each page's <ql-course modules="..."> attribute.
course_modules_mapping = {}
# NOTE(review): csv.DictReader keys rows by the header line, so this file
# must start with a 'title,details,description,link' header — confirm that
# scrape_journey.py writes one.
with open(DATA_FOLDER.joinpath(f"{COURSE_CODE}-Courses.csv")) as f:
    course_meta = io.StringIO(f.read())

csvreader = csv.DictReader(course_meta)
for course in csvreader:
    # TODO: Support scraping GCB Labs
    if "labs" in course["title"].lower():
        continue
    r = requests.get(course["link"])
    print(str(r.content)[:100])
    html_parser = etree.HTMLParser()
    dom = etree.fromstring(r.content, html_parser)

    # Prerequisites are the text of the <div> following the one whose child
    # text reads 'Prerequisites'.
    prerequisites = None
    if prerequisites := dom.xpath(
        "(//div[div/text() = 'Prerequisites'])/following-sibling::div/text()"
    ):
        prerequisites = "".join(prerequisites[0]).replace("\n", " ")

    # The course structure is embedded as an HTML-escaped JSON attribute;
    # pages without it are skipped.
    if course_modules := dom.xpath("//ql-course/@modules"):
        course_modules = course_modules[0]
        course_modules = unescape(course_modules)
    else:
        continue

    course_modules_mapping[course["title"]] = course_modules

    # Append one meta row (title, link, prerequisites) per scraped course.
    with open(
        DATA_FOLDER.joinpath(f"{COURSE_CODE}-Modules-Meta.csv"),
        "a",
        encoding="utf-8",
    ) as f:
        csvwriter = csv.writer(f)
        print(prerequisites)
        csvwriter.writerow([course["title"], course["link"], prerequisites])

for course_title, course_module in course_modules_mapping.items():
    # Parse the raw JSON into pydantic models (Course has a list __root__).
    parsed_courses = [c for c in Course.parse_raw(course_module).__root__]
    submodule_activities: dict[CourseSubmodule, list[dict]] = defaultdict(list)

    # Link Submodules with their activities
    for submodule in parsed_courses:
        for step in submodule.steps:
            submodule_activities[submodule].extend(
                [activity.dict() for activity in step.activities]
            )

    course_title = get_safestring(course_title)

    # One CSV per submodule listing that submodule's activities.
    for submodule in submodule_activities:
        submodule_title = get_safestring(submodule.title)
        with open(
            DATA_FOLDER.joinpath(f"{submodule_title}.csv"),
            "w",
            encoding="utf-8",
        ) as f:
            fieldnames = Activity.__fields__.keys()
            csvwriter = csv.DictWriter(f, fieldnames)
            csvwriter.writeheader()
            csvwriter.writerows(submodule_activities[submodule])

    # One CSV per course summarising its submodules (nested steps and the
    # UI-only 'expanded' flag excluded).
    with open(DATA_FOLDER.joinpath(f"{course_title}.csv"), "w", encoding="utf-8") as f:
        excluded_fields = {"steps", "expanded"}
        fieldnames = sorted(set(CourseSubmodule.__fields__.keys()) - excluded_fields)
        csvwriter = csv.DictWriter(f, fieldnames)
        csvwriter.writeheader()
        csvwriter.writerows(
            [submodule.dict(exclude=excluded_fields) for submodule in parsed_courses]
        )
class KaggleTutorial(BaseModel): 21 | name: str 22 | url: str # E.g "/code/ryanholbrook/what-is-feature-engineering" 23 | authorUsername: str 24 | 25 | @validator("url", each_item=True) 26 | def convert_to_absolute_url(cls, url): 27 | return convert_relative_url_to_absolute(url, domain=KAGGLE_URL) 28 | 29 | description: str 30 | learnTutorial: KaggleTutorial 31 | 32 | 33 | class KagglePrerequsite(BaseModel): 34 | name: str 35 | trackSlug: str 36 | 37 | @validator("trackSlug", each_item=True) 38 | def convert_to_absolute_url(cls, trackSlug): 39 | return convert_relative_url_to_absolute(trackSlug, domain=KAGGLE_LEARN_URL) 40 | 41 | 42 | class KaggleAuthor(BaseModel): 43 | displayName: str 44 | userName: str 45 | 46 | 47 | class KaggleCourse(BaseModel): 48 | name: str 49 | description: str 50 | estimatedTimeHours: int 51 | trackSlug: str 52 | lessons: list[KaggleLesson] 53 | prerequisites: Optional[list[KagglePrerequsite]] 54 | authors: list[KaggleAuthor] 55 | 56 | @validator("trackSlug", each_item=True) 57 | def convert_to_absolute_url(cls, trackSlug): 58 | return convert_relative_url_to_absolute(trackSlug, domain=KAGGLE_LEARN_URL) 59 | 60 | @property 61 | def processed_authors(self): 62 | return ",".join( 63 | [f"{author.userName}|{author.displayName}" for author in self.authors] 64 | ) 65 | 66 | def write_course_summary_to_file(self, f): 67 | csvwriter = writer(f) 68 | csvwriter.writerow(["name", "description", "duration", "url", "authors"]) 69 | csvwriter.writerows( 70 | [ 71 | [ 72 | self.name, 73 | self.description, 74 | self.estimatedTimeHours, 75 | self.trackSlug, 76 | self.processed_authors, 77 | ], 78 | EMPTY_CSV_ROW, 79 | ] 80 | ) 81 | 82 | if self.prerequisites: 83 | # Write prerequisites 84 | csvwriter.writerow( 85 | ["prerequisites"], 86 | ) 87 | csvwriter.writerows( 88 | [[p.name, p.trackSlug] for p in self.prerequisites] + EMPTY_CSV_ROW 89 | ) 90 | 91 | # Write lessons 92 | csvwriter.writerow(["lessons"]) 93 | csvwriter.writerow(["name", "description", 
"url", "authorUserName"]) 94 | csvwriter.writerows( 95 | [ 96 | [ 97 | lesson.learnTutorial.name, 98 | lesson.description, 99 | lesson.learnTutorial.url, 100 | lesson.learnTutorial.authorUsername, 101 | ] 102 | for lesson in self.lessons 103 | ] 104 | ) 105 | csvwriter.writerow(EMPTY_CSV_ROW) 106 | -------------------------------------------------------------------------------- /getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | This guide will help you set up and run the Learning Path Index (LPI) project on your local machine. 4 | 5 | ## Prerequisites 6 | 7 | Before you begin, ensure you have the following installed: 8 | 9 | - **Python (tested with version 3.x):** Download and install Python from [the official website](https://www.python.org/downloads/). 10 | - **Git:** Download and install Git from [the official website](https://git-scm.com/downloads). 11 | - **Docker (optional):** While not strictly necessary, Docker simplifies running certain components. Install Docker following the [instructions for your operating system](https://docs.docker.com/get-docker/). 12 | 13 | ## Installation 14 | 15 | The LPI project consists of several independent applications (applets) that work together: 16 | 17 | - **Web scraper** 18 | - **LLM Variant 01 (Ollama)** 19 | - **LLM Variant 02 (OpenAI)** 20 | 21 | **Note:** This guide focuses on setting up the general repository. Each applet may have additional instructions. 22 | 23 | ### Step 1: Clone the Repository 24 | 25 | Clone the repository to your local machine using the following command: 26 | 27 | ```bash 28 | git clone https://github.com/neomatrix369/learning-path-index.git learning-path-index 29 | 30 | cd learning-path-index 31 | ``` 32 | 33 | ### Step 2: Install Dependencies 34 | 35 | Python dependencies for each applet are located in the `./requirements/` directory, with each applet having a separate file. 
Here's how to install them using a virtual environment: 36 | 37 | #### Create a virtual environment: 38 | 39 | ```bash 40 | python -m venv venv 41 | ``` 42 | 43 | #### Activate the virtual environment 44 | 45 | MacOS/Linux 46 | 47 | ```bash 48 | . venv/bin/activate 49 | ``` 50 | 51 | Windows 52 | 53 | ```powershell 54 | venv\Scripts\activate 55 | ``` 56 | 57 | #### Install base dependencies 58 | 59 | ```bash 60 | pip install -r requirements/base.txt 61 | ``` 62 | 63 | #### Install dependencies for specific applets 64 | 65 | Each applet may have additional dependencies. Look for a requirements.txt file within the directory for the specific applet (e.g., requirements/scraper.txt) and install them using: 66 | 67 | ```bash 68 | pip install -r requirements/<applet>.txt 69 | ``` 70 | 71 | Replace `<applet>` with the actual name of the applet (e.g., `scraper`). 72 | 73 | ### Step 3: Setup pre-commit hooks 74 | 75 | Pre-commit hooks automate tasks like code formatting and linting. Install them using: 76 | 77 | ```bash 78 | pre-commit install 79 | ``` 80 | 81 | ### Step 4: Setup the Applets 82 | 83 | Each applet has its own setup and usage instructions. Refer to the documentation specific to each applet for detailed guidance on: 84 | 85 | - Web scraper: [Installation instructions and usage guide](app/course-scraper/README.md). 86 | 87 | - LLM Variant 01 (Ollama): [Instructions on setting up and using Ollama](app/llm-poc-variant-01/deploy/aws/README.md). 88 | 89 | - LLM Variant 02 (OpenAI): [Instructions on creating an OpenAI account and API keys](app/llm-poc-variant-02/README.md). 90 | 91 | **Tip**: Look for additional documentation files within the directory for each applet. 92 | 93 | ## Troubleshooting 94 | 95 | Here are some common issues or errors that you might face: 96 | 97 | - Dependency Conflicts: Ensure that your dependencies are up to date and consistent with the versions specified in the requirements directory.
98 | - OpenAI Rate limit error: The free tier of OpenAI has limitations on API calls. Consider upgrading to a paid account if you frequently encounter this error. 99 | 100 | For further assistance, refer to the project's documentation (if available) or reach out to the project maintainers on the GitHub repository. 101 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==23.2.1 2 | aiohappyeyeballs==2.4.3 3 | aiohttp==3.11.6 4 | aiosignal==1.3.1 5 | anyio==4.6.2.post1 6 | asgiref==3.8.1 7 | asyncer==0.0.7 8 | attrs==24.2.0 9 | backoff==2.2.1 10 | bcrypt==4.2.1 11 | bidict==0.23.1 12 | build==1.2.2.post1 13 | cachetools==5.5.0 14 | certifi==2024.8.30 15 | chainlit==1.1.404 16 | charset-normalizer==3.4.0 17 | chevron==0.14.0 18 | chroma-hnswlib==0.7.6 19 | chromadb==0.5.20 20 | click==8.1.7 21 | coloredlogs==15.0.1 22 | dataclasses-json==0.5.14 23 | Deprecated==1.2.15 24 | durationpy==0.9 25 | fastapi==0.110.3 26 | filelock==3.16.1 27 | filetype==1.2.0 28 | flatbuffers==24.3.25 29 | frozenlist==1.5.0 30 | fsspec==2024.10.0 31 | google-auth==2.36.0 32 | googleapis-common-protos==1.66.0 33 | greenlet==3.1.1 34 | grpcio==1.68.0 35 | h11==0.14.0 36 | httpcore==1.0.7 37 | httptools==0.6.4 38 | httpx==0.27.2 39 | huggingface-hub==0.26.2 40 | humanfriendly==10.0 41 | idna==3.10 42 | importlib_metadata==8.5.0 43 | importlib_resources==6.4.5 44 | Jinja2==3.1.4 45 | joblib==1.4.2 46 | kubernetes==31.0.0 47 | langchain==0.0.261 48 | langsmith==0.0.92 49 | Lazify==0.4.0 50 | literalai==0.0.607 51 | markdown-it-py==3.0.0 52 | MarkupSafe==3.0.2 53 | marshmallow==3.23.1 54 | mdurl==0.1.2 55 | mmh3==5.0.1 56 | monotonic==1.6 57 | mpmath==1.3.0 58 | multidict==6.1.0 59 | mypy-extensions==1.0.0 60 | nest-asyncio==1.6.0 61 | networkx==3.4.2 62 | numexpr==2.10.1 63 | numpy==1.26.4 64 | nvidia-cublas-cu12==12.4.5.8 65 | 
nvidia-cuda-cupti-cu12==12.4.127 66 | nvidia-cuda-nvrtc-cu12==12.4.127 67 | nvidia-cuda-runtime-cu12==12.4.127 68 | nvidia-cudnn-cu12==9.1.0.70 69 | nvidia-cufft-cu12==11.2.1.3 70 | nvidia-curand-cu12==10.3.5.147 71 | nvidia-cusolver-cu12==11.6.1.9 72 | nvidia-cusparse-cu12==12.3.1.170 73 | nvidia-nccl-cu12==2.21.5 74 | nvidia-nvjitlink-cu12==12.4.127 75 | nvidia-nvtx-cu12==12.4.127 76 | oauthlib==3.2.2 77 | onnxruntime==1.20.0 78 | openapi-schema-pydantic==1.2.4 79 | opentelemetry-api==1.28.2 80 | opentelemetry-exporter-otlp==1.28.2 81 | opentelemetry-exporter-otlp-proto-common==1.28.2 82 | opentelemetry-exporter-otlp-proto-grpc==1.28.2 83 | opentelemetry-exporter-otlp-proto-http==1.28.2 84 | opentelemetry-instrumentation==0.49b2 85 | opentelemetry-instrumentation-asgi==0.49b2 86 | opentelemetry-instrumentation-fastapi==0.49b2 87 | opentelemetry-proto==1.28.2 88 | opentelemetry-sdk==1.28.2 89 | opentelemetry-semantic-conventions==0.49b2 90 | opentelemetry-util-http==0.49b2 91 | orjson==3.10.11 92 | overrides==7.7.0 93 | packaging==23.2 94 | pillow==11.0.0 95 | posthog==3.7.2 96 | propcache==0.2.0 97 | protobuf==5.28.3 98 | pyasn1==0.6.1 99 | pyasn1_modules==0.4.1 100 | pydantic==1.10.19 101 | Pygments==2.18.0 102 | PyJWT==2.10.0 103 | PyPika==0.48.9 104 | pyproject_hooks==1.2.0 105 | python-dateutil==2.9.0.post0 106 | python-dotenv==1.0.1 107 | python-engineio==4.10.1 108 | python-multipart==0.0.9 109 | python-socketio==5.11.4 110 | PyYAML==6.0.2 111 | regex==2024.11.6 112 | requests==2.32.3 113 | requests-oauthlib==2.0.0 114 | rich==13.9.4 115 | rsa==4.9 116 | safetensors==0.4.5 117 | scikit-learn==1.5.2 118 | scipy==1.14.1 119 | sentence-transformers==3.3.1 120 | setuptools==75.6.0 121 | shellingham==1.5.4 122 | simple-websocket==1.1.0 123 | six==1.16.0 124 | sniffio==1.3.1 125 | SQLAlchemy==2.0.36 126 | starlette==0.37.2 127 | sympy==1.13.1 128 | syncer==2.0.3 129 | tenacity==8.5.0 130 | threadpoolctl==3.5.0 131 | tokenizers==0.20.3 132 | tomli==2.1.0 133 | 
torch==2.5.1 134 | tqdm==4.67.0 135 | transformers==4.46.3 136 | triton==3.1.0 137 | typer==0.13.1 138 | typing-inspect==0.9.0 139 | typing_extensions==4.12.2 140 | uptrace==1.28.2 141 | urllib3==2.2.3 142 | uvicorn==0.25.0 143 | uvloop==0.21.0 144 | watchfiles==0.20.0 145 | websocket-client==1.8.0 146 | websockets==14.1 147 | wrapt==1.16.0 148 | wsproto==1.2.0 149 | yarl==1.17.2 150 | zipp==3.21.0 151 | -------------------------------------------------------------------------------- /app/llm-gemma-variant/src/vector_db.py: -------------------------------------------------------------------------------- 1 | from llama_index.core import SimpleDirectoryReader 2 | from llama_index.readers.file import CSVReader 3 | import weaviate 4 | from loguru import logger 5 | import pandas as pd 6 | from llama_index.core import VectorStoreIndex, StorageContext 7 | from llama_index.vector_stores.weaviate import WeaviateVectorStore 8 | 9 | 10 | class VectorDB: 11 | """ 12 | Create a weaviate vector database from a the Learning Path Index csv file. 13 | """ 14 | def __init__( 15 | self, 16 | data_path: str, 17 | index_name: str, 18 | ): 19 | """ 20 | Initialize the VectorDB class. 21 | 22 | Args: 23 | data_path: str, path to the Learning Path Index csv file. 24 | index_name: str, name of the index to create. 25 | Output: 26 | None 27 | """ 28 | self.data_path = data_path 29 | self.index_name = index_name 30 | 31 | try: 32 | self.client = weaviate.connect_to_embedded() 33 | except weaviate.exceptions.WeaviateConnectionError as e: 34 | raise ConnectionError(f"Failed to connect to Weaviate: {str(e)}") 35 | 36 | def disconnect(self): 37 | """ 38 | Disconnect from the Weaviate vector database. 
39 | """ 40 | if self.client: 41 | self.client.close() # Assuming the client has a close method 42 | logger.info("Disconnected from the Weaviate vector database.") 43 | else: 44 | logger.warning("No active connection to disconnect.") 45 | 46 | 47 | def LPI_loader(self): 48 | """ 49 | Load the Learning Path Index csv file. 50 | """ 51 | # Load data csv file 52 | df = pd.read_csv(self.data_path) 53 | 54 | # Use the CSVReader to load the data and load each row as a document by setting concat_rows=False 55 | parser = CSVReader(concat_rows=False) 56 | file_extractor = {".csv": parser} # Add other CSV formats as needed 57 | 58 | # Load the documents 59 | documents = SimpleDirectoryReader( 60 | input_files = [self.data_path], file_extractor=file_extractor 61 | ).load_data() 62 | 63 | logger.debug(documents[1]) 64 | 65 | 66 | # Adding Metadata to the documents 67 | for i, row in df.iterrows(): 68 | metadata = { 69 | 'source': row['Source'], 70 | 'course': row['Course_Learning_Material'], 71 | 'module': row['Module'] 72 | } 73 | documents[i + 1].metadata = metadata 74 | 75 | return documents 76 | 77 | def vector_db_creation(self): 78 | """ 79 | Create a weaviate vector database from the Learning Path Index csv file. 
80 | """ 81 | documents = self.LPI_loader() 82 | 83 | 84 | logger.info(f"Connected to the weaviate embedded instance: {self.client.is_ready()}") 85 | 86 | # Create the vector database 87 | vector_store = WeaviateVectorStore( 88 | weaviate_client = self.client, 89 | index_name = self.index_name 90 | ) 91 | 92 | # Set up the storage for the embeddings 93 | storage_context = StorageContext.from_defaults(vector_store=vector_store) 94 | # Setup the index 95 | # build VectorStoreIndex that takes care of chunking documents 96 | # and encoding chunks to embeddings for future retrieval 97 | 98 | logger.info(f"Creating the {self.index_name} index") 99 | index = VectorStoreIndex.from_documents( 100 | documents, storage_context=storage_context 101 | ) 102 | logger.info(f"The {self.index_name} index has been created") 103 | return index 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .chainlit/ 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | .idea/ 162 | 163 | # Security 164 | *.pem 165 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/aws/README.md: -------------------------------------------------------------------------------- 1 | # Deployment: Infrastructure as code 2 | 3 | The scripts in this folder give the ability to provision and manage compute capacity using [AWS Infrastructure]([link to follow]), in order to deploy the docker container and run the app in it. 
4 | 5 | In short the scripts does the below: 6 | - [instructions to follow] 7 | 8 | **Table of content** 9 | - [Pre-requisites](#pre-requisites) 10 | - [Provisioning Infrastructure using Terraform](#provisioning-infrastructure-using-terraform) 11 | + [Create infrastructure from the CLI using Terraform](#create-infrastructure-from-the-cli-using-terraform) 12 | + [Deploy the docker image with the notebooks and libraries](#deploy-the-docker-image-with-the-notebooks-and-libraries) 13 | + [Destroy infrastructure (cleanup)](#destroy-infrastructure-cleanup) 14 | - [Security](#security) 15 | 16 | ## Pre-requisites 17 | 18 | - [AWS & Relates stuff] 19 | - [Install Terraform](https://learn.hashicorp.com/terraform/getting-started/install.html) (all methods for the various platforms are mentioned) 20 | - Clone this repo and in the right folder: 21 | ```bash 22 | $ git clone https://github.com/neomatrix369/learning-path-index/ 23 | $ cd learning-path-index 24 | $ cd app/llm-poc-variant-01/deploy/aws 25 | ``` 26 | 27 | For a summary (also helps to verify the steps) of the above steps please see [here](https://www.terraform.io/docs/providers/aws/index.html). 28 | 29 | ## Provisioning Infrastructure using Terraform 30 | 31 | ### Create infrastructure from the CLI using Terraform 32 | 33 | - Deploy with terraform 34 | 35 | ```bash 36 | $ terraform init 37 | $ terraform apply -var "ssh_private_key=$(cat )" --auto-approve 38 | ``` 39 | 40 | The deployment process should end with a list of private/public ip addresses like so: 41 | 42 | ```bash 43 | Apply complete! Resources: 9 added, 0 changed, 0 destroyed. 44 | 45 | Outputs: 46 | 47 | instance_private_ips = [ 48 | 10.1.nn.m 49 | ] 50 | instance_public_ips = [ 51 | 1xx.145.174.85 52 | ] 53 | 54 | ``` 55 | 56 | The public IP addresses are fairly dynamic in nature and could be between any range (example shown above). Please make a note of the Public IP above as it will be needed in the following steps. 
57 | 58 | ### Deploy the docker image with the notebooks and libraries 59 | 60 | - use ssh and docker to make that end meet 61 | 62 | ```bash 63 | $ ./run-docker-container.sh 64 | ``` 65 | 66 | ### Recover/retry from failed attempt 67 | 68 | - Apply the fix to the configuration or script or both 69 | - And run the below again: 70 | 71 | ```bash 72 | $ terraform apply -var "ssh_private_key=$(cat )" --auto-approve 73 | ``` 74 | 75 | ### Start clean after a failed attempt (errors encountered) 76 | 77 | - Run the below before proceeding: 78 | 79 | ```bash 80 | $ terraform destroy -var "ssh_private_key=$(cat )" --auto-approve 81 | $ terraform apply -var "ssh_private_key=$(cat )" --auto-approve 82 | ``` 83 | 84 | 85 | ### Destroy infrastructure (cleanup) 86 | 87 | - Remove resources or destroy them with terraform 88 | 89 | ```bash 90 | $ terraform destroy -var "ssh_private_key=$(cat )" --auto-approve 91 | ``` 92 | 93 | You should see something like this at the end of a successful run: 94 | 95 | ```text 96 | . 97 | . 98 | . 99 | Destroy complete! Resources: 7 destroyed. 100 | ``` 101 | 102 | ### Security 103 | 104 | Note that this setup does not take into account establishing a secure `http` i.e. `https` communication between the Jupyter lab instance and the browser. Please beware when using this in your target domain depending on the prerequisites you need to conform to. This example is good for learning and illustration purposes, please do NOT deploy it in production or public facing environments. 
105 | 106 | --- 107 | 108 | Go to [Main page](../../README.md) -------------------------------------------------------------------------------- /app/llm-poc-variant-01/deploy/gcp/README.md: -------------------------------------------------------------------------------- 1 | # Deployment: Infrastructure as Code 2 | 3 | The scripts in this folder give the ability to provision and manage compute capacity using [Google Cloud Platform](https://cloud.google.com/), in order to deploy the LLM Application and provision the Chainlit app. 4 | 5 | In short, the scripts do the following: 6 | - Create compute instances and associated network resources necessary to run an instance on GCP 7 | - Create the necessary firewall configurations to allow services to communicate publicly over HTTP and HTTPS 8 | - Set up an [Ollama](https://github.com/ollama/ollama) service on the Compute instance, and start it 9 | - Finally, start the Chainlit app, and expose it on port 8000. The interface will be accessible on `:8000` 10 | 11 | ![Preview of the Chainlit app](chainlit-app-demo.gif "Preview of the Chainlit app") 12 | 13 | **Table of Contents** 14 | - [Pre-requisites](#pre-requisites) 15 | - [Provisioning Infrastructure using Terraform](#provisioning-infrastructure-using-terraform) 16 | - [Create a new project on Google Console](#create-a-new-project-on-google-console) 17 | - [Authenticate Terraform with GCloud credentials](#authenticate-terraform-with-gcloud-credentials) 18 | - [Create Cloud Bucket to store Terraform state](#create-cloud-bucket-to-store-terraform-state) 19 | - [Deploy with Terraform](#deploy-with-terraform) 20 | - [Destroy infrastructure (cleanup)](#destroy-infrastructure-cleanup) 21 | - [Security](#security) 22 | 23 | ## Pre-requisites 24 | 25 | - A Google Console account with some credits. [If it's a new GCP account, you might get access to free $300 credits](https://cloud.google.com/free?hl=en) 26 | - Install GCloud CLI. 
See [the official GCloud installation guide](https://cloud.google.com/sdk/docs/install). 27 | - [Install Terraform](https://learn.hashicorp.com/terraform/getting-started/install.html) (all operating systems are supported) 28 | - Clone this repository 29 | 30 | ```bash 31 | git clone https://github.com/neomatrix369/learning-path-index/ 32 | cd learning-path-index 33 | cd app/llm-poc-variant-01/deploy/gcp 34 | ``` 35 | 36 | For a summary (also helps to verify the steps) of the above steps please see [here](https://registry.terraform.io/providers/hashicorp/google/latest/docs). 37 | 38 | ### Quick Terraform install script for Linux 39 | 40 | ```bash 41 | curl -sSL https://releases.hashicorp.com/terraform/1.9.8/terraform_1.9.8_linux_386.zip -o ~/terraform_1.9.8_linux_386.zip 42 | 43 | unzip -q ~/terraform_1.9.8_linux_386.zip -d /tmp/terraform_1.9.8_linux_386 44 | mv /tmp/terraform_1.9.8_linux_386/terraform /usr/local/bin/terraform 45 | 46 | rm -rf /tmp/terraform_1.9.8_linux_386/ 47 | ``` 48 | 49 | 50 | 51 | ## Provisioning Infrastructure using Terraform 52 | - #### Create a new project on Google Console. 53 | In your terminal, in the LPI repository folder, run: 54 | ```bash 55 | gcloud config set project 56 | ``` 57 | 58 | - #### Authenticate Terraform with GCloud credentials 59 | This workflow assumes you are working on a personal computer/workstation. For CI/CD pipelines, [other authentication steps are recommended](https://cloud.google.com/docs/terraform) 60 | ```bash 61 | gcloud auth application-default login 62 | ``` 63 | 64 | - #### Create Cloud Bucket to store Terraform state 65 | ```bash 66 | gsutil mb -l europe-west1 gs://llm-project-sbx-tf-state 67 | 68 | gsutil versioning set on gs://llm-project-sbx-tf-state 69 | ``` 70 | 71 | Substitute `europe-west1` for [any other region of your choice](https://cloud.google.com/compute/docs/regions-zones). 
72 | 73 | - #### Deploy with terraform 74 | 75 | ```bash 76 | terraform init 77 | 78 | terraform workspace new llm-project 79 | 80 | terraform apply --auto-approve 81 | ``` 82 | 83 | The deployment process should end with a list of private/public ip addresses like so: 84 | 85 | ```bash 86 | Apply complete! Resources: 1 added, 0 changed, 0 destroyed. 87 | 88 | Outputs: 89 | 90 | network_interface_0_access_config_0_nat_ip = "" 91 | network_interface_0_network_ip = "" 92 | self_link = "" 93 | tags = toset([ 94 | "http-server", 95 | "https-server", 96 | "lpi-sg", 97 | ]) 98 | ``` 99 | 100 | The public IP addresses are fairly dynamic in nature and could be between any range (example shown above). Please make a note of the Public IP above as it will be needed in the following steps. 101 | 102 | - #### SSH into the Compute Instance 103 | The compute instance can be accessed over SSH viz: 104 | ```bash 105 | gcloud compute ssh --project= --zone= lpi-cpu-vm 106 | ``` 107 | 108 | ### Destroy infrastructure (cleanup) 109 | 110 | - Remove resources or destroy them with terraform 111 | 112 | ```bash 113 | $ terraform destroy --var-file=project.tfvars --auto-approve 114 | ``` 115 | 116 | You should see something like this at the end of a successful run: 117 | 118 | ```text 119 | . 120 | . 121 | . 122 | Destroy complete! Resources: 7 destroyed. 123 | ``` 124 | 125 | ### Security 126 | 127 | Note that this setup does not take into account establishing a secure `http` i.e. `https` communication between the Chainlit instance and the browser, nor does it place emphasis on creating a fool-proof firewall for the compute instance. Please beware when using this in your target domain depending on the prerequisites you need to conform to. This example is good for learning and illustration purposes, please do NOT deploy it in production or public facing environments. 
128 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/lpiGPT.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import time 5 | from datetime import datetime 6 | 7 | import torch 8 | from constants import CHROMA_SETTINGS 9 | from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 10 | from langchain.chains import RetrievalQA 11 | from langchain.embeddings import HuggingFaceEmbeddings 12 | from langchain.llms import Ollama 13 | from langchain.prompts import PromptTemplate 14 | from langchain.vectorstores import Chroma 15 | from langchain.vectorstores.base import VectorStoreRetriever 16 | 17 | OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'http://localhost:11434') 18 | 19 | 20 | def build_retriever( 21 | model_embeddings: str, 22 | persist_directory: str, 23 | target_source_chunks: int = 500, 24 | ) -> VectorStoreRetriever: 25 | embeddings = HuggingFaceEmbeddings(model_name=model_embeddings) 26 | vector_db = Chroma( 27 | persist_directory, 28 | embedding_function=embeddings, 29 | client_settings=CHROMA_SETTINGS, 30 | ) 31 | return vector_db.as_retriever(search_kwargs={'k': target_source_chunks}) 32 | 33 | 34 | def build_prompt(): 35 | """ 36 | Reference/Guide: 37 | - https://smith.langchain.com/hub/rlm/rag-prompt-mistral 38 | - https://smith.langchain.com/hub/rlm/rag-prompt-llama 39 | """ 40 | prompt_template = """ 41 | [INST] 42 | <> You are an assistant for question-answering tasks using the Learning Path Index. 43 | Show the results in a table or tabular form, and the results must contain a link for each line of the courses, modules or sub-modules returned. 
44 | <> 45 | Context: {context} 46 | Question: {question} 47 | Answer: [/INST] 48 | """ 49 | return PromptTemplate( 50 | template=prompt_template, input_variables=['context', 'question'] 51 | ) 52 | 53 | 54 | def build_model( 55 | retriever: VectorStoreRetriever, 56 | model_name: str = 'gemma:2b', 57 | mute_stream: bool = False, 58 | ): 59 | IS_GPU_AVAILABLE = torch.cuda.is_available() 60 | ( 61 | print( 62 | f'~~~ GPU is available (CUDA-DNN Enabled: {torch.backends.cudnn.enabled}) ~~~' 63 | ) 64 | if IS_GPU_AVAILABLE 65 | else print('~~~ GPU is NOT available, falling back to CPU ~~~') 66 | ) 67 | start = time.time() 68 | 69 | # activate/deactivate the streaming StdOut callback for LLMs 70 | callbacks = [] if mute_stream else [StreamingStdOutCallbackHandler()] 71 | llm = Ollama(model=model_name, callbacks=callbacks, base_url=OLLAMA_HOST) 72 | qa = RetrievalQA.from_chain_type( 73 | llm=llm, 74 | chain_type='stuff', 75 | retriever=retriever, 76 | return_source_documents=True, 77 | chain_type_kwargs={'prompt': build_prompt()}, 78 | ) 79 | 80 | end = time.time() 81 | 82 | print(f'Models took about {end - start} seconds to load.') 83 | return qa, llm 84 | 85 | 86 | def parse_arguments(): 87 | parser = argparse.ArgumentParser( 88 | description='lpiGPT: Ask questions to your documents without an internet connection, ' 89 | 'using the power of LLMs (the InstructGPT or Chat model).' 
90 | ) 91 | # https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard or https://ollama.ai/library 92 | parser.add_argument( 93 | '--chat-model', 94 | '-CM', 95 | action='store', 96 | default='gemma:2b', 97 | help='Use this flag to set the InstructGPT or Chat model name, see https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard or https://ollama.ai/library for more names.', 98 | ) 99 | # For embeddings model, the example uses a sentence-transformers model 100 | # https://www.sbert.net/docs/pretrained_models.html 101 | # "The all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster 102 | # and still offers good quality." 103 | parser.add_argument( 104 | '--embeddings-model-name', 105 | '-EM', 106 | action='store', 107 | default='all-MiniLM-L6-v2', 108 | help='Use this flag to set the Embeddings model name, see https://www.sbert.net/docs/pretrained_models.html for examples of names. Use the same model as used for ingesting the documents (ingest.py)', 109 | ) 110 | 111 | parser.add_argument( 112 | '--persist-directory', 113 | '-P', 114 | action='store', 115 | default='vector_db', 116 | help='Use this flag to specify the name of the vector database, this will be a folder on the local machine.', 117 | ) 118 | 119 | parser.add_argument( 120 | '--target-source-chunks', 121 | '-C', 122 | action='store', 123 | default=500, 124 | help='Use this flag to specify the name chunk size to use to chunk source data.', 125 | ) 126 | 127 | parser.add_argument( 128 | '--hide-source', 129 | '-S', 130 | action='store_true', 131 | help='Use this flag to disable printing of source documents used for answers.', 132 | ) 133 | 134 | parser.add_argument( 135 | '--mute-stream', 136 | '-M', 137 | action='store_true', 138 | help='Use this flag to disable the streaming StdOut callback for LLMs.', 139 | ) 140 | 141 | return parser.parse_args() 142 | 143 | 144 | def main(): 145 | args = parse_arguments() 146 | retriever = 
build_retriever(args.embeddings_model_name, args.persist_directory) 147 | qa, _llm = build_model( 148 | retriever, 149 | model_name=args.chat_model, 150 | mute_stream=args.mute_stream, 151 | ) 152 | # Interactive questions and answers 153 | while True: 154 | query = input('\nEnter a query: ') 155 | if query == 'exit': 156 | break 157 | if query.strip() == '': 158 | continue 159 | 160 | # Get the answer from the chain 161 | start = time.time() 162 | print( 163 | f"\nStart time: {datetime.utcfromtimestamp(start).strftime('%Y-%m-%d %H:%M:%S')}" 164 | ) 165 | answer = qa({'query': query}) 166 | answer, docs = ( 167 | answer['result'], 168 | answer.get('source_documents', []), 169 | ) 170 | end = time.time() 171 | 172 | # Print the result 173 | print('\n\n> Question:') 174 | print(query) 175 | print( 176 | f"\nEnd time: {datetime.utcfromtimestamp(end).strftime('%Y-%m-%d %H:%M:%S')}" 177 | ) 178 | print(f'\nAnswer (took about {end - start} seconds):') 179 | print(answer) 180 | 181 | # Print the relevant sources used for the answer 182 | if not args.hide_source: 183 | for document in docs: 184 | print('\n> ' + document.metadata['source'] + ':') 185 | print(document.page_content) 186 | 187 | 188 | if __name__ == '__main__': 189 | main() 190 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning Path Index 2 | A repo with data files, assets and code supporting and powering the Learning Path Index. 
3 | 4 | Table of Contents 5 | 6 | - [Overview](#overview) 7 | - [Key Features](#key-features) 8 | - [Potential Innovations](#potential-innovations) 9 | - [How to contribute to this initiative?](#how-to-contribute-to-this-initiative) 10 | - [Important Links](#important-links) 11 | - [Credits](#credits) 12 | 13 | ## Overview 14 | 15 | The **Learning Path Index** is a dynamic and versatile repository designed to empower learners in the fields of Data Science and Machine Learning. It offers a curated collection of byte-sized courses and learning materials, meticulously organized and tagged to facilitate effortless discovery. Whether you're a novice or a seasoned practitioner, the Learning Path Index is your gateway to knowledge, tailored to your interests and needs. 16 | 17 | The outcome of this effort was the creation of this _Git repo_ and the KaggleX [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset) by the mentors and mentees of **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between August 2023 and November 2023), see the [Credits](#credits) section for more details. 18 | 19 | 20 | ## Key Features 21 | 22 | ### 1. Comprehensive Collection 23 | - A vast array of byte-sized courses and learning materials covering Data Science and Machine Learning topics. 24 | - Courses are categorized and tagged by keywords, categories, topics, and interests, all closely aligned with the world of Data Science and Machine Learning. 25 | 26 | ### 2. Robust Search and Filtering 27 | - Effortless search and filtering capabilities allow you to find the content you need quickly. 28 | - Search by full or partial text, including keywords, categories, topics, and interests. 29 | 30 | ### 3. Collaborative Contribution 31 | - Easy-to-use mechanisms for adding new courses and enhancing existing entries. 32 | - Contribute your expertise and help refine course definitions for the benefit of the entire community. 
33 | 34 | ### [4. Automated Data/Course Scraping *(WIP)*:](./app/course-scraper) 35 | - Automatically scrape course information and details from multiple platforms. 36 | - Data enrichment and augmentation using AI! 37 | 38 | ### [5. Keyword Extraction with KeyBERT and WordWise-Kaggle Notebook](https://github.com/neomatrix369/learning-path-index/blob/main/app/Keyword%20Extraction%20with%20KeyBERT%20and%20WordWise.ipynb) 39 | 40 | ### [6. Learning Pathway Index Data Cleaning and Preprocessing](https://www.kaggle.com/code/manishkr1754/lpi-data-cleaning-and-preprocessing/notebook) 41 | 42 | ### [7. Contextual Search On Kaggle Learning Path Index](https://github.com/mbhoge/learning-path-index-rag-search/blob/learning-path-index-search-01/app/llm-poc-variant-02/README.md) 43 | 44 | ## Getting Started and Setup 45 | Please refer to the getting started guide, [Getting Started](getting-started.md), for setup instructions. 46 | 47 | ## Potential Innovations 48 | 49 | Explore exciting possibilities for enhancing the Learning Path Index: 50 | 51 | 1. **Course Chunking**: Divide pending courses into byte-sized modules for a more digestible learning experience. 52 | 53 | 2. **Content Enrichment**: Assist in fine-tuning, correcting, and enriching existing byte-sized entries to ensure high-quality learning materials. 54 | 55 | 3. **Kaggle Dataset**: Transform the Learning Path Index into a dataset and host it on Kaggle Datasets for broader accessibility. 56 | 57 | 4. **Keyword Extraction**: Automatically extract keywords from course websites and byte-sized modules to enhance search functionality. 58 | 59 | 5. **Exploratory Data Analysis (EDA)**: Conduct exploratory data analysis on course materials to gain valuable insights into the content of the datasets. 60 | 61 | 6. **NLP Profiler**: Implement NLP Profiler and Pandas Profiler to analyze courses by various parameters, uncovering hidden patterns. 62 | 63 | 7. 
**Interactive Learning**: Develop a Streamlit, Shiny, or Mercury app to make these courses available online, fostering an interactive learning environment. 64 | 65 | 8. **Cloud Hosting**: Deploy the app on popular cloud platforms like Heroku, Netlify, AWS, or others for widespread accessibility. 66 | 67 | 9. **Notebook Integration**: Create Google Colab, Kaggle Notebook, Amazon Notebook, or Interactive Jupyter Notebook integrations to facilitate seamless course exploration. 68 | 69 | 10. **NLP Enhancement**: Apply advanced NLP techniques to the existing data to extract deeper linguistic value and meaning. 70 | 71 | 11. **Generative AI**: Utilize the dataset to build Large Language Models (LLMs) and Generative AI models, opening doors to innovative AI-related activities. 72 | 73 | 12. **Continuous Improvement**: Brainstorm and implement additional ideas to enhance the tool's utility for both the community and individuals. 74 | 75 | Join us in this exciting journey of learning, collaboration, and innovation. Together, we can create a valuable resource for the Data Science and Machine Learning community. Let's embark on the path to knowledge and discovery! 76 | 77 | 78 | ## How to contribute to this initiative? 79 | 80 | - You can also join us by taking part in the next [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607)) 81 | - Keep your eyes open on the **Kaggle Discussions** page and other **KaggleX** social media channels. 
Or find us on the [Kaggle Discord](https://www.kaggle.com/discussions/general/429933) channel to learn more about the next steps 82 | - Create notebooks from this data 83 | - Create supplementary or complementary data for or from this dataset 84 | - Submit corrections/enhancements or anything else to help improve this dataset so it has a wider use and purpose 85 | 86 | ## Important Links 87 | 88 | - [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607)) 89 | - KaggleX [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset) 90 | - KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) 91 | - [New Official Kaggle Discord Server!](https://www.kaggle.com/discussions/general/429933) 92 | 93 | ## Credits 94 | 95 | Credits for all the work done to create this Git Repo and the KaggleX [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset/data) goes to these mentors and mentees (in no particular order): [Manish Kumar](https://www.kaggle.com/manishkr1754), [Ben Aji](https://www.kaggle.com/benajii) (_mentor_), [Emmanuel Katchy](https://www.kaggle.com/tobetek), [Ezeogu Ernest](https://www.kaggle.com/tobetek), [Manish](https://www.kaggle.com/manish5), [Mustafa](https://www.kaggle.com/mustafa254), [Nnamdi Idowu-Anifowoshe](https://www.kaggle.com/idowuchukwudi), [Sheba Alkali](https://www.kaggle.com/shebaalkali), [Zainab ikeoluwa](https://www.kaggle.com/zainabikeoluwa), [Wendy Mak](https://www.kaggle.com/wwymak) (_mentor_), [Misirya Hameed](https://www.linkedin.com/in/misiriya-shahul-hameed-b3957875) (_mentor_), [Chukwuebuka Obi](https://www.kaggle.com/chukwuebukaobi), [Victor Umunna](https://www.kaggle.com/victorumunna), [Pui Yueng](https://www.kaggle.com/lorentzyeung), [Afolake Solomon](https://www.kaggle.com/flakkyddon), [Faith 
Osoro](https://www.kaggle.com/faithosoro), [Chukwudi Idowu](https://www.kaggle.com/chukwudiidowu) and many others who were part of the **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between August 2023 and November 2023). 96 | 97 | Our gratitude also goes to our silent supporters of this initiative from organisers to the mentors and mentees whose help and support kept us going. 98 | 99 | _**Note:** In case your name or mention is missed out in the above list, then please let us know._ 100 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/ingest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import glob 4 | import os 5 | import time 6 | from multiprocessing import Pool 7 | from typing import List 8 | 9 | from constants import CHROMA_SETTINGS 10 | from langchain.docstore.document import Document 11 | from langchain.document_loaders import CSVLoader 12 | from langchain.embeddings import HuggingFaceEmbeddings 13 | from langchain.text_splitter import RecursiveCharacterTextSplitter 14 | from langchain.vectorstores import Chroma 15 | from tqdm import tqdm 16 | 17 | # Map file extensions to document loaders and their arguments 18 | LOADER_MAPPING = { 19 | '.csv': (CSVLoader, {}), 20 | # Add more mappings for other file extensions and loaders as needed 21 | } 22 | 23 | 24 | def load_single_document(file_path: str) -> List[Document]: 25 | ext = '.' 
+ file_path.rsplit('.', 1)[-1] 26 | if ext in LOADER_MAPPING: 27 | loader_class, loader_args = LOADER_MAPPING[ext] 28 | loader = loader_class(file_path, **loader_args) 29 | return loader.load() 30 | 31 | raise ValueError(f"Unsupported file extension '{ext}'") 32 | 33 | 34 | def load_documents(source_dir: str, ignored_files: List[str] = None) -> List[Document]: 35 | """ 36 | Loads all documents from the source documents directory, ignoring specified files 37 | """ 38 | if not ignored_files: 39 | ignored_files = [] 40 | all_files = [] 41 | for ext in LOADER_MAPPING: 42 | all_files.extend( 43 | glob.glob(os.path.join(source_dir, f'**/*{ext}'), recursive=True) 44 | ) 45 | filtered_files = [ 46 | file_path for file_path in all_files if file_path not in ignored_files 47 | ] 48 | 49 | with Pool(processes=os.cpu_count()) as pool: 50 | results = [] 51 | with tqdm( 52 | total=len(filtered_files), desc='Loading new documents', ncols=80 53 | ) as pbar: 54 | for _, docs in enumerate( 55 | pool.imap_unordered(load_single_document, filtered_files) 56 | ): 57 | results.extend(docs) 58 | pbar.update() 59 | 60 | return results 61 | 62 | 63 | def process_documents( 64 | source_documents: str, 65 | chunk_size: int, 66 | chunk_overlap: int, 67 | ignored_files: List[str] = None, 68 | ) -> List[Document]: 69 | """ 70 | Load documents and split in chunks 71 | """ 72 | if not ignored_files: 73 | ignored_files = [] 74 | start_time = time.time() 75 | print(f'Loading documents from {source_documents}') 76 | 77 | documents = load_documents(source_documents, ignored_files) 78 | if not documents: 79 | print('No new documents to load') 80 | exit(0) 81 | print(f'Loaded {len(documents)} new documents from {source_documents}') 82 | 83 | text_splitter = RecursiveCharacterTextSplitter( 84 | chunk_size=chunk_size, chunk_overlap=chunk_overlap 85 | ) 86 | texts = text_splitter.split_documents(documents) 87 | print(f'Split into {len(texts)} chunks of text (max. 
{chunk_size} tokens each)') 88 | end_time = time.time() 89 | print(f'Loading documents took about {end_time - start_time} seconds to complete.') 90 | return texts 91 | 92 | 93 | def does_vectorstore_exist(persist_directory: str) -> bool: 94 | """ 95 | Checks if vectorstore exists 96 | """ 97 | if os.path.exists(os.path.join(persist_directory, 'index')) and ( 98 | os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) 99 | and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')) 100 | ): 101 | list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin')) 102 | list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl')) 103 | # At least 3 documents are needed in a working vectorstore 104 | if len(list_index_files) > 3: 105 | return True 106 | return False 107 | 108 | 109 | def parse_arguments(): 110 | parser = argparse.ArgumentParser( 111 | description='ingest: process one or more documents (text) in order to create embeddings (using the Embeddings models)' 112 | 'from them, and make them ready to be used with LLMs when a question is asked to the InstructGPT or Chat Model.' 113 | ) 114 | # For embeddings model, the example uses a sentence-transformers model 115 | # https://www.sbert.net/docs/pretrained_models.html 116 | # "The all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster 117 | # and still offers good quality." 118 | parser.add_argument( 119 | '--embeddings-model-name', 120 | '-EM', 121 | action='store', 122 | default='all-MiniLM-L6-v2', 123 | help='Use this flag to set the Embeddings model name, see https://www.sbert.net/docs/pretrained_models.html for examples of names. 
Use the same model when running the lpiGPT.py app.', 124 | ) 125 | 126 | parser.add_argument( 127 | '--source-documents', 128 | '-S', 129 | action='store', 130 | default='source_documents', 131 | help='Use this flag to specify the name of the folder where all the (source/input) documents are stored for ingestion purposes, on the local machine. The documents contained in them are of the type `.csv`.', 132 | ) 133 | 134 | parser.add_argument( 135 | '--persist-directory', 136 | '-P', 137 | action='store', 138 | default='vector_db', 139 | help='Use this flag to specify the name of the vector database, this will be a folder on the local machine.', 140 | ) 141 | 142 | parser.add_argument( 143 | '--target-source-chunks', 144 | '-C', 145 | action='store', 146 | default=500, 147 | help='Use this flag to specify the name chunk size to use to chunk source data.', 148 | ) 149 | 150 | parser.add_argument( 151 | '--chunk-overlap', 152 | '-O', 153 | action='store', 154 | default=50, 155 | help='Use this flag to specify the name chunk overlap value to use to chunk source data.', 156 | ) 157 | 158 | return parser.parse_args() 159 | 160 | 161 | def main(): 162 | args = parse_arguments() 163 | 164 | start_time = time.time() 165 | # Create embeddings 166 | print('\nCreating/downloading HF embeddings started...') 167 | embeddings = HuggingFaceEmbeddings(model_name=args.embeddings_model_name) 168 | end_time = time.time() 169 | print( 170 | f'Creating/downloading HF embeddings completed! It took about {end_time - start_time} seconds to complete.' 
171 | ) 172 | 173 | start_time = time.time() 174 | print('\nStarted with ingestion process, to create vector database...') 175 | if does_vectorstore_exist(args.persist_directory): 176 | # Update and store locally vectorstore 177 | print(f'-- Appending to existing vectorstore at {args.persist_directory}') 178 | vector_db = Chroma( 179 | persist_directory=args.persist_directory, 180 | embedding_function=embeddings, 181 | client_settings=CHROMA_SETTINGS, 182 | ) 183 | collection = vector_db.get() 184 | texts = process_documents( 185 | args.source_documents, 186 | args.target_source_chunks, 187 | args.chunk_overlap, 188 | [metadata['source'] for metadata in collection['metadatas']], 189 | ) 190 | print('-- Creating embeddings. May take some minutes...') 191 | vector_db.add_documents(texts) 192 | else: 193 | # Create and store locally vectorstore 194 | print('-- Creating new vectorstore') 195 | texts = process_documents( 196 | args.source_documents, args.target_source_chunks, args.chunk_overlap 197 | ) 198 | print('-- Creating embeddings. May take some minutes...') 199 | vector_db = Chroma.from_documents( 200 | texts, 201 | embeddings, 202 | persist_directory=args.persist_directory, 203 | client_settings=CHROMA_SETTINGS, 204 | ) 205 | vector_db.persist() 206 | vector_db = None 207 | end_time = time.time() 208 | 209 | print( 210 | f'Ingestion complete! It took about {end_time - start_time} seconds to complete.' 
211 | ) 212 | print('\nYou can now run lpiGPT.py to query your documents') 213 | 214 | 215 | if __name__ == '__main__': 216 | main() 217 | -------------------------------------------------------------------------------- /app/llm-poc-variant-02/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | from datetime import datetime 4 | import time 5 | from langchain.llms import OpenAI 6 | from langchain.document_loaders import TextLoader 7 | from langchain.text_splitter import CharacterTextSplitter 8 | from langchain.embeddings.openai import OpenAIEmbeddings 9 | from langchain.chains import RetrievalQA 10 | from langchain.prompts import PromptTemplate 11 | from langchain.llms import OpenAI 12 | from langchain.vectorstores import FAISS 13 | from langchain.prompts import PromptTemplate 14 | 15 | from interface import app 16 | import streamlit as st 17 | # Define GenerateLearningPathIndexEmbeddings class: 18 | # - Load .csv file 19 | # - Chunk text 20 | # - Chunk size = 1000 characters 21 | # - Chunk overlap = 30 characters 22 | # - Create FAISS vector store from chunked text and OpenAI embeddings 23 | # - Get FAISS vector store 24 | # This class is used to generate the FAISS vector store from the .csv file. 25 | class GenerateLearningPathIndexEmbeddings: 26 | def __init__(self, csv_filename): 27 | load_dotenv() # Load .env file 28 | self.openai_api_key = os.getenv("OPENAI_API_KEY") 29 | # load the csv file from the data folder above 2 folders 30 | self.data_path = os.path.join('..\..\data', csv_filename) 31 | self.our_custom_data = None 32 | self.openai_embeddings = None 33 | self.faiss_vectorstore = None 34 | 35 | self.load_csv_data() 36 | self.get_openai_embeddings() 37 | self.create_faiss_vectorstore_with_csv_data_and_openai_embeddings() 38 | 39 | def load_csv_data(self): 40 | # Load your dataset (e.g., CSV, JSON, etc.) 
41 | print(' -- Started loading .csv file for chunking purposes.') 42 | loader = TextLoader(self.data_path) 43 | document = loader.load() 44 | text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=30, separator="\n") 45 | self.our_custom_data = text_splitter.split_documents(document) 46 | print(f' -- Finished spitting (i.e. chunking) text (i.e. documents) from the .csv file (i.e. {self.data_path}).') 47 | 48 | def get_openai_embeddings(self): 49 | self.openai_embeddings = OpenAIEmbeddings(openai_api_key=self.openai_api_key, request_timeout=60) 50 | 51 | def create_faiss_vectorstore_with_csv_data_and_openai_embeddings(self): 52 | faiss_vectorstore_foldername = "faiss_learning_path_index" 53 | if not os.path.exists(faiss_vectorstore_foldername): 54 | print(' -- Creating a new FAISS vector store from chunked text and OpenAI embeddings.') 55 | vectorstore = FAISS.from_documents(self.our_custom_data, self.openai_embeddings) 56 | vectorstore.save_local(faiss_vectorstore_foldername) 57 | print(f' -- Saved the newly created FAISS vector store at "{faiss_vectorstore_foldername}".') 58 | else: 59 | print(f' -- WARNING: Found existing FAISS vector store at "{faiss_vectorstore_foldername}", loading from cache.') 60 | print(f' -- NOTE: Delete the FAISS vector store at "{faiss_vectorstore_foldername}", if you wish to regenerate it from scratch for the next run.') 61 | self.faiss_vectorstore = FAISS.load_local( 62 | "faiss_learning_path_index", self.openai_embeddings 63 | ) 64 | 65 | def get_faiss_vector_store(self): 66 | return self.faiss_vectorstore 67 | 68 | 69 | # https://discuss.streamlit.io/t/how-to-check-if-code-is-run-inside-streamlit-and-not-e-g-ipython/23439/7 70 | def running_inside_streamlit(): 71 | """ 72 | Function to check whether python code is run within streamlit 73 | 74 | Returns 75 | ------- 76 | use_streamlit : boolean 77 | True if code is run within streamlit, else False 78 | """ 79 | try: 80 | from streamlit.runtime.scriptrunner import 
get_script_run_ctx 81 | if not get_script_run_ctx(): 82 | use_streamlit = False 83 | else: 84 | use_streamlit = True 85 | except ModuleNotFoundError: 86 | use_streamlit = False 87 | return use_streamlit 88 | 89 | 90 | # Define GenAI class: 91 | # - Create prompt template 92 | # - Create GenAI project 93 | # - Get response for query 94 | # This class is used to get the response for a query from the GenAI project. 95 | # The GenAI project is created from the FAISS vector store. 96 | class GenAILearningPathIndex: 97 | def __init__(self, faiss_vectorstore): 98 | load_dotenv() # Load .env file 99 | self.openai_api_key = os.getenv("OPENAI_API_KEY") 100 | self.faiss_vectorstore = faiss_vectorstore 101 | 102 | prompt_template = \ 103 | """ 104 | Use the following template to answer the question at the end, 105 | from the Learning Path Index csv file, 106 | display top 10 results in a tablular format and it 107 | should look like this: 108 | | Learning Pathway | duration | link | Module 109 | | --- | --- | --- | --- | 110 | | ... | ... | ... | ... | 111 | it must contain a link for each line of the result in a table, 112 | consider the duration and Module information mentioned in the question, 113 | If you don't know the answer, don't make an entry in the table, 114 | {context} 115 | Question: {question} 116 | """ 117 | PROMPT = PromptTemplate(template=prompt_template, input_variables=["context","question"]) 118 | # The chain_type_kwargs are passed to the chain_type when it is created. 119 | self.chain_type_kwargs = {"prompt": PROMPT} 120 | # Create the GenAI project 121 | self.llm = OpenAI(temperature=1.0, openai_api_key=self.openai_api_key) 122 | # Get response for query 123 | # The response is returned as a string. 
124 | 125 | def get_response_for(self, query: str): 126 | qa = RetrievalQA.from_chain_type( 127 | llm=self.llm, chain_type="stuff", 128 | retriever=self.faiss_vectorstore.as_retriever(), 129 | chain_type_kwargs=self.chain_type_kwargs 130 | ) 131 | return qa.run(query) 132 | 133 | def get_formatted_time(current_time = time.time()): 134 | return datetime.utcfromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S') 135 | 136 | # Load the model 137 | @st.cache_data 138 | def load_model(): 139 | start_time = time.time() 140 | print(f"\nStarted loading custom embeddings (created from .csv file) at {get_formatted_time(start_time)}") 141 | learningPathIndexEmbeddings = GenerateLearningPathIndexEmbeddings("Learning_Pathway_Index.csv") 142 | faiss_vectorstore = learningPathIndexEmbeddings.get_faiss_vector_store() 143 | end_time = time.time() 144 | print(f"Finished loading custom embeddings (created from .csv file) at {get_formatted_time(end_time)}") 145 | print(f"Custom embeddings (created from .csv file) took about {end_time - start_time} seconds to load.") 146 | return faiss_vectorstore 147 | 148 | # Query the model 149 | def query_gpt_model(query: str): 150 | start_time = time.time() 151 | print(f"\nQuery processing start time: {get_formatted_time(start_time)}") 152 | genAIproject = GenAILearningPathIndex(faiss_vectorstore) 153 | answer = genAIproject.get_response_for(query) 154 | end_time = time.time() 155 | print(f"\nQuery processing finish time: {get_formatted_time(end_time)}") 156 | print(f"\nAnswer (took about {end_time - start_time} seconds)") 157 | return answer 158 | 159 | 160 | if __name__=='__main__': 161 | faiss_vectorstore = load_model() 162 | 163 | if running_inside_streamlit(): 164 | print("\nStreamlit environment detected. 
\nTo run a CLI interactive version just run `python main.py` in the CLI.\n") 165 | query_from_stream_list = app() 166 | if query_from_stream_list: 167 | answer = query_gpt_model(query_from_stream_list) 168 | st.write(answer) 169 | else: 170 | print("\nCommand-line interactive environment detected.\n") 171 | while True: 172 | query = input("\nEnter a query: ") 173 | if query == "exit": 174 | break 175 | if query.strip() == "": 176 | continue 177 | 178 | if query: 179 | answer = query_gpt_model(query) 180 | 181 | print("\n\n> Question:") 182 | print(query) 183 | print(answer) 184 | -------------------------------------------------------------------------------- /app/llm-poc-variant-02/learning_path_index_contextual_search.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import os\n","from dotenv import load_dotenv\n","from datetime import datetime\n","import time\n","from langchain.llms import OpenAI\n","from langchain.document_loaders import TextLoader\n","from langchain.text_splitter import CharacterTextSplitter\n","from langchain.embeddings.openai import OpenAIEmbeddings\n","from langchain.chains import RetrievalQA\n","from langchain.prompts import PromptTemplate\n","from langchain.llms import OpenAI\n","from langchain.vectorstores import FAISS\n","from langchain.prompts import PromptTemplate"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from interface import app\n","import streamlit as st\n","# Define GenerateLearningPathIndexEmbeddings class: \n","# - Load .csv file\n","# - Chunk text\n","# - Chunk size = 1000 characters\n","# - Chunk overlap = 30 characters\n","# - Create FAISS vector store from chunked text and OpenAI embeddings\n","# - Get FAISS vector store\n","# This class is used to generate the FAISS vector store from the .csv file.\n","class GenerateLearningPathIndexEmbeddings:\n"," 
def __init__(self, csv_filename):\n"," load_dotenv() # Load .env file\n"," self.openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n"," self.data_path = os.path.join('..\\..\\data', csv_filename)\n"," self.our_custom_data = None\n"," self.openai_embeddings = None\n"," self.faiss_vectorstore = None\n"," self.load_csv_data()\n"," self.get_openai_embeddings()\n"," self.create_faiss_vectorstore_with_csv_data_and_openai_embeddings()\n"," \n"," def load_csv_data(self):\n"," # Load your dataset (e.g., CSV, JSON, etc.)\n"," print(' -- Started loading .csv file for chunking purposes.')\n"," loader = TextLoader(self.data_path)\n"," document = loader.load()\n"," text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=30, separator=\"\\n\")\n"," self.our_custom_data = text_splitter.split_documents(document)\n"," print(f' -- Finished spitting (i.e. chunking) text (i.e. documents) from the .csv file (i.e. {self.data_path}).')\n"," \n"," def get_openai_embeddings(self):\n"," self.openai_embeddings = OpenAIEmbeddings(openai_api_key=self.openai_api_key, request_timeout=60)\n"," \n"," def create_faiss_vectorstore_with_csv_data_and_openai_embeddings(self):\n"," faiss_vectorstore_foldername = \"faiss_learning_path_index\"\n"," if not os.path.exists(faiss_vectorstore_foldername):\n"," print(' -- Creating a new FAISS vector store from chunked text and OpenAI embeddings.')\n"," vectorstore = FAISS.from_documents(self.our_custom_data, self.openai_embeddings)\n"," vectorstore.save_local(faiss_vectorstore_foldername)\n"," print(f' -- Saved the newly created FAISS vector store at \"{faiss_vectorstore_foldername}\".')\n"," else:\n"," print(f' -- WARNING: Found existing FAISS vector store at \"{faiss_vectorstore_foldername}\", loading from cache.')\n"," print(f' -- NOTE: Delete the FAISS vector store at \"{faiss_vectorstore_foldername}\", if you wish to regenerate it from scratch for the next run.')\n"," self.faiss_vectorstore = FAISS.load_local(\n"," \"faiss_learning_path_index\", 
self.openai_embeddings\n"," )\n"," def get_faiss_vector_store(self):\n"," return self.faiss_vectorstore"]},{"cell_type":"markdown","metadata":{},"source":["https://discuss.streamlit.io/t/how-to-check-if-code-is-run-inside-streamlit-and-not-e-g-ipython/23439/7"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["def running_inside_streamlit():\n"," \"\"\"\n"," Function to check whether python code is run within streamlit\n"," Returns\n"," -------\n"," use_streamlit : boolean\n"," True if code is run within streamlit, else False\n"," \"\"\"\n"," try:\n"," from streamlit.runtime.scriptrunner import get_script_run_ctx\n"," if not get_script_run_ctx():\n"," use_streamlit = False\n"," else:\n"," use_streamlit = True\n"," except ModuleNotFoundError:\n"," use_streamlit = False\n"," return use_streamlit"]},{"cell_type":"markdown","metadata":{},"source":["Define GenAI class:
\n"," - Create prompt template
\n"," - Create GenAI project
\n"," - Get response for query
\n","This class is used to get the response for a query from the GenAI project.
\n","The GenAI project is created from the FAISS vector store."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["class GenAILearningPathIndex:\n"," def __init__(self, faiss_vectorstore):\n"," load_dotenv() # Load .env file\n"," self.openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n"," self.faiss_vectorstore = faiss_vectorstore\n"," prompt_template = \\\n"," \"\"\"\n"," Use the following template to answer the question at the end, \n"," from the Learning Path Index csv file,\n"," display top 10 results in a tablular format and it \n"," should look like this:\n"," | Learning Pathway | duration | link | Module\n"," | --- | --- | --- | --- |\n"," | ... | ... | ... | ... |\n"," it must contain a link for each line of the result in a table,\n"," consider the duration and Module information mentioned in the question,\n"," If you don't know the answer, don't make an entry in the table,\n"," {context}\n"," Question: {question}\n"," \"\"\"\n"," PROMPT = PromptTemplate(template=prompt_template, input_variables=[\"context\",\"question\"])\n"," # The chain_type_kwargs are passed to the chain_type when it is created.\n"," self.chain_type_kwargs = {\"prompt\": PROMPT}\n"," # Create the GenAI project \n"," self.llm = OpenAI(temperature=1.0, openai_api_key=self.openai_api_key)\n"," # Get response for query\n"," # The response is returned as a string. 
\n"," \n"," def get_response_for(self, query: str):\n"," qa = RetrievalQA.from_chain_type(\n"," llm=self.llm, chain_type=\"stuff\", \n"," retriever=self.faiss_vectorstore.as_retriever(),\n"," chain_type_kwargs=self.chain_type_kwargs\n"," )\n"," return qa.run(query)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["def get_formatted_time(current_time = time.time()):\n"," return datetime.utcfromtimestamp(current_time).strftime('%Y-%m-%d %H:%M:%S')"]},{"cell_type":"markdown","metadata":{},"source":[" Load the model"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["@st.cache_data\n","def load_model():\n"," start_time = time.time()\n"," print(f\"\\nStarted loading custom embeddings (created from .csv file) at {get_formatted_time(start_time)}\")\n"," learningPathIndexEmbeddings = GenerateLearningPathIndexEmbeddings(\"Learning_Pathway_Index.csv\")\n"," faiss_vectorstore = learningPathIndexEmbeddings.get_faiss_vector_store()\n"," end_time = time.time()\n"," print(f\"Finished loading custom embeddings (created from .csv file) at {get_formatted_time(end_time)}\")\n"," print(f\"Custom embeddings (created from .csv file) took about {end_time - start_time} seconds to load.\")\n"," return faiss_vectorstore"]},{"cell_type":"markdown","metadata":{},"source":[" Query the model"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["def query_gpt_model(query: str):\n"," start_time = time.time()\n"," print(f\"\\nQuery processing start time: {get_formatted_time(start_time)}\")\n"," genAIproject = GenAILearningPathIndex(faiss_vectorstore)\n"," answer = genAIproject.get_response_for(query)\n"," end_time = time.time()\n"," print(f\"\\nQuery processing finish time: {get_formatted_time(end_time)}\")\n"," print(f\"\\nAnswer (took about {end_time - start_time} seconds)\")\n"," return answer"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["if 
__name__=='__main__':\n"," faiss_vectorstore = load_model()\n"," if running_inside_streamlit():\n"," print(\"\\nStreamlit environment detected. \\nTo run a CLI interactive version just run `python main.py` in the CLI.\\n\")\n"," query_from_stream_list = app()\n"," if query_from_stream_list:\n"," answer = query_gpt_model(query_from_stream_list)\n"," st.write(answer)\n"," else:\n"," print(\"\\nCommand-line interactive environment detected.\\n\")\n"," while True:\n"," query = input(\"\\nEnter a query: \")\n"," if query == \"exit\":\n"," break\n"," if query.strip() == \"\":\n"," continue\n"," if query:\n"," answer = query_gpt_model(query)\n"," print(\"\\n\\n> Question:\")\n"," print(query)\n"," print(answer)"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.4"}},"nbformat":4,"nbformat_minor":2} 2 | -------------------------------------------------------------------------------- /app/llm-poc-variant-01/README.md: -------------------------------------------------------------------------------- 1 | # lpiGPT - Learning Path Index GPT 2 | 3 | Ever thought you could ask/query a GPT about a course or smaller module of a course and have it find such bits of learning material across multiple sources of courses. 4 | 5 | A standalone GPT app based on [Ollama](https://github.com/jmorganca/ollama) and the [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset). 6 | 7 | It's simple and runs on the local machine with smaller sized and free LLMs. 
8 | 9 | > Note: credit for this program goes to the original authors of [langchain-python-rag-privategpt](https://github.com/jmorganca/ollama/tree/main/examples/langchain-python-rag-privategpt) from Ivan Martinez who contributed to an example on [jmorganca/ollama](https://github.com/jmorganca/ollama). 10 | 11 | 12 | ## Table of Contents 13 | 14 | - [Requirements](#requirements) 15 | - [Installation](#installation) 16 | - [Setup](#setup) 17 | - [Downloading Learning Path Index datasets](#downloading-learning-path-index-datasets) 18 | - [Ingesting files](#ingesting-files) 19 | - [via native shell CLI](#via-native-shell-cli) 20 | - [Usage](#usage) 21 | - [Ask questions](#ask-questions) 22 | - [via native shell CLI](#via-native-shell-cli-1) 23 | - [via Docker container](#via-docker-container) 24 | - [Try a different model](#try-a-different-model) 25 | - [Adding more files](#adding-more-files) 26 | - [Models](#models) 27 | - [Embeddings models](#embeddings-models) 28 | - [Chat models](#chat-models) 29 | - [Known issues](#known-issues) 30 | - [Contributing](#contributing) 31 | - [License](#license) 32 | 33 | ## Requirements 34 | 35 | List out the key requirements needed to run the project, such as: 36 | 37 | - System requirements: 38 | - Quadcore Intel CPU 2.3Ghz or higher, 16-32GB RAM, 100 GB Free diskspace 39 | - Preferably Linux or macOS 40 | - Python 3.9 41 | - [pyenv](https://github.com/pyenv/pyenv) 42 | - or venv 43 | - or [pipenv](https://pipenv.pypa.io/en/latest/) 44 | - Docker (optional) 45 | - Ollama ([Download & Install](https://ollama.com/download)) 46 | - Windows: 47 | - Microsoft Visual C++ 14.0 or greater is required (needed when installing ```hnswlib``` ) 48 | 49 | ## Installation 50 | 51 | Install [Ollama](https://github.com/jmorganca/ollama) using the below command on the host/local machine: 52 | 53 | ```bash 54 | curl https://ollama.ai/install.sh | sh 55 | ``` 56 | 57 | Pull the model you'd like to use: 58 | 59 | ```shell 60 | ollama pull 
llama2-uncensored 61 | ``` 62 | 63 | Set up a virtual environment (or use the [Docker route](#via-docker-container)): 64 | 65 | ```shell 66 | python3 -m venv .venv 67 | source .venv/bin/activate 68 | ``` 69 | 70 | Please note there are other options to use as well i.e. Conda, venv, virtualenv, poetry, etc. to isolate your development environments. 71 | 72 | For Windows, download Microsoft Visual C++ 14.0 or greater ([Link](https://visualstudio.microsoft.com/visual-cpp-build-tools/)). During installation, ensure that "Desktop development with C++" is selected. 73 | 74 | Install the Python dependencies: 75 | 76 | ```shell 77 | pip install -r requirements.txt 78 | ``` 79 | 80 | If you haven't installed Ollama yet, refer to the [Ollama repository](https://github.com/ollama/ollama) for installation instructions. 81 | 82 | Pull the model you'd like to use: 83 | 84 | ```shell 85 | ollama pull llama2-uncensored 86 | ``` 87 | 88 | and start the Ollama server 89 | 90 | ```shell 91 | ollama serve 92 | ``` 93 | 94 | 95 | ## Setup 96 | 97 | ### Downloading Learning Path Index datasets 98 | 99 | ```bash 100 | mkdir -p source_documents 101 | 102 | curl https://raw.githubusercontent.com/neomatrix369/learning-path-index/main/data/Courses_and_Learning_Material.csv -o "source_documents/Courses_and_Learning_Material.csv" 103 | 104 | curl https://raw.githubusercontent.com/neomatrix369/learning-path-index/main/data/Learning_Pathway_Index.csv -o "source_documents/Learning_Pathway_Index.csv" 105 | ``` 106 | 107 | Or you can manually download them from the [Kaggle Dataset: Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset). 
108 | 109 | ### Ingesting files 110 | 111 | #### via native shell CLI 112 | 113 | ```shell 114 | python3 ingest.py 115 | ``` 116 | 117 | Output should look like this: 118 | 119 | ```shell 120 | root@sai-XPS-15-9560:/home# python3 ingest.py 121 | Downloading (…)e9125/.gitattributes: 100%|███████████████████████████████████████████████████████████████████| 1.18k/1.18k [00:00<00:00, 2.07MB/s] 122 | Downloading (…)_Pooling/config.json: 100%|████████████████████████████████████████████████████████████████████████| 190/190 [00:00<00:00, 378kB/s] 123 | Downloading (…)7e55de9125/README.md: 100%|███████████████████████████████████████████████████████████████████| 10.6k/10.6k [00:00<00:00, 16.2MB/s] 124 | Downloading (…)55de9125/config.json: 100%|███████████████████████████████████████████████████████████████████████| 612/612 [00:00<00:00, 1.53MB/s] 125 | Downloading (…)ce_transformers.json: 100%|████████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 252kB/s] 126 | Downloading (…)125/data_config.json: 100%|███████████████████████████████████████████████████████████████████| 39.3k/39.3k [00:00<00:00, 29.4MB/s] 127 | Downloading pytorch_model.bin: 100%|█████████████████████████████████████████████████████████████████████████| 90.9M/90.9M [00:09<00:00, 9.11MB/s] 128 | Downloading (…)nce_bert_config.json: 100%|█████████████████████████████████████████████████████████████████████| 53.0/53.0 [00:00<00:00, 97.4kB/s] 129 | Downloading (…)cial_tokens_map.json: 100%|████████████████████████████████████████████████████████████████████████| 112/112 [00:00<00:00, 698kB/s] 130 | Downloading (…)e9125/tokenizer.json: 100%|█████████████████████████████████████████████████████████████████████| 466k/466k [00:00<00:00, 5.22MB/s] 131 | Downloading (…)okenizer_config.json: 100%|████████████████████████████████████████████████████████████████████████| 350/350 [00:00<00:00, 627kB/s] 132 | Downloading (…)9125/train_script.py: 
100%|███████████████████████████████████████████████████████████████████| 13.2k/13.2k [00:00<00:00, 21.1MB/s] 133 | Downloading (…)7e55de9125/vocab.txt: 100%|█████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 10.7MB/s] 134 | Downloading (…)5de9125/modules.json: 100%|████████████████████████████████████████████████████████████████████████| 349/349 [00:00<00:00, 721kB/s] 135 | Creating new vectorstore 136 | Loading documents from source_documents 137 | Loading new documents: 100%|██████████████████████| 2/2 [00:00<00:00, 40.44it/s] 138 | Loaded 1414 new documents from source_documents 139 | Split into 2214 chunks of text (max. 500 tokens each) 140 | Creating embeddings. May take some minutes... 141 | Ingestion complete! You can now run lpiGPT.py to query your documents 142 | ``` 143 | 144 | ```bash 145 | usage: ingest.py [-h] [--embeddings-model-name EMBEDDINGS_MODEL_NAME] [--source-documents SOURCE_DOCUMENTS] [--persist-directory PERSIST_DIRECTORY] 146 | [--target-source-chunks TARGET_SOURCE_CHUNKS] [--chunk-overlap CHUNK_OVERLAP] 147 | 148 | ingest: ingest: process one or more documents (text) in order to create embeddings (using the Embeddings models) from them, and make them ready to be used with LLMs when a question is asked to the InstructGPT or Chat Model. 149 | 150 | optional arguments: 151 | -h, --help show this help message and exit 152 | --embeddings-model-name EMBEDDINGS_MODEL_NAME, -EM EMBEDDINGS_MODEL_NAME 153 | Use this flag to set the Embeddings model name, see https://www.sbert.net/docs/pretrained_models.html for examples of names. Use the same model 154 | when running the lpiGPT.py app. 155 | --source-documents SOURCE_DOCUMENTS, -S SOURCE_DOCUMENTS 156 | Use this flag to specify the name of the folder where all the (source/input) documents are stored for ingestion purposes, on the local machine. The 157 | documents contained in them are of the type `.csv`. 
158 | --persist-directory PERSIST_DIRECTORY, -P PERSIST_DIRECTORY 159 | Use this flag to specify the name of the vector database, this will be a folder on the local machine. 160 | --target-source-chunks TARGET_SOURCE_CHUNKS, -C TARGET_SOURCE_CHUNKS 161 | Use this flag to specify the name chunk size to use to chunk source data. 162 | --chunk-overlap CHUNK_OVERLAP, -O CHUNK_OVERLAP 163 | Use this flag to specify the name chunk overlap value to use to chunk source data. 164 | ``` 165 | 166 | #### Known issues 167 | 168 | - When trying to ingest and also run the GPT app, we can get this error on systems with Python 3.10 or older 169 | 170 | ```python 171 | RuntimeError: Your system has an unsupported version of sqlite3. Chroma requires sqlite3 >= 3.35.0. 172 | ``` 173 | 174 | If this occurs then use the Docker container to run your commands, instructions are given below under each sub-section. 175 | 176 | [back to ToC](#table-of-contents) 177 | 178 | ## Usage 179 | 180 | ### Ask questions 181 | 182 | #### via native shell CLI 183 | 184 | Before running ```lpiGPT.py``` you need to specify the base URL for the Ollama API or the local instance of Ollama running on your machine. By default this will return a ```None``` value. 
185 | 186 | - Windows: 187 | - This is typically http://localhost:11434 and can be set by using the following in command line: 188 | ```shell 189 | set OLLAMA_HOST=http://localhost:11434 190 | ``` 191 | 192 | ```shell 193 | python3 lpiGPT.py 194 | 195 | Enter a query: Fetch me all machine learning courses of the advanced level from the Learning Path Index and show me results in a tabular form 196 | 197 | Start time: 2023-10-07 16:14:18 198 | > Question: 199 | Fetch me all machine learning courses of the advanced level from the Learning Path Index and show me results in a tabular form 200 | End time: 2023-10-07 16:17:19 201 | Answer (took about 181.3118166923523 seconds): 202 | | Course Name | Level | Type | Duration | Module / Sub-module | Keywords/Tags/Skills/Interests/Categories | Links | 203 | |-------------------------------|--------|-------|----------|--------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------| 204 | 1. Machine Learning Engineer Learning Path | Intermediate to Advanced | Free during mentorship period | AI Foundations: Quiz | Machine Learning/ Cloud/Data/Infrastructure/Bigquery/| https://www.cloudskillsboost.google/course_sessions/4968855/quizzes/387518 205 | 2. Machine Learning Engineer Learning Path | Intermediate to Advanced | Free during mentorship period | AI Development Workflow: Quiz | AI/Development/API/Vertex AI/MLOps/Workflow| https://www.cloudskillsboost.google/course_sessions/4968855/quizzes/387541 206 | 3. Machine Learning Engineer Learning Path | Intermediate to Advanced | Free during mentorship period | AI Development Options: Quiz | AI/Development/API/Vertex AI/AutoML/Workflow| https://www.cloudskillsboost.google/course_sessions/4968855/quizzes/387529 207 | 4. 
Machine Learning Engineer Learning Path | Intermediate to Advanced | Free during mentorship period | BigQuery Machine Learning: Develop ML Models Where Your Data Lives: Introduction | Big Query/Explanable AI/ML models/Hyperparameter. tuning/recommendation system| https://www.cloudskillsboost.google/course_sessions/4968855/quizzes/387530 208 | Note: The results will be displayed in a table format with columns for Course Name, Level, Type, Duration, Module / Sub-module, Keywords/Tags/Skills/Interests/Categories and Links. 209 | 210 | . 211 | . 212 | . 213 | [A list of source documents it got the results from] 214 | . 215 | . 216 | . 217 | ``` 218 | 219 | To exit the GPT prompt, press Ctrl-C or Ctrl-D and it will return to the Linux/Command-prompt. 220 | 221 | 222 | ```bash 223 | > python3 lpiGPT.py --help 224 | usage: lpiGPT.py [-h] [--chat-model CHAT_MODEL] [--embeddings-model-name EMBEDDINGS_MODEL_NAME] [--persist-directory PERSIST_DIRECTORY] 225 | [--target-source-chunks TARGET_SOURCE_CHUNKS] [--hide-source] [--mute-stream] 226 | 227 | lpiGPT: Ask questions to your documents without an internet connection, the power of LLMs (the InstructGPT or Chat model). 228 | 229 | optional arguments: 230 | -h, --help show this help message and exit 231 | --chat-model CHAT_MODEL, -CM CHAT_MODEL 232 | Use this flag to set the InstructGPT or Chat model name, see https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard or 233 | https://ollama.ai/library for more names. 234 | --embeddings-model-name EMBEDDINGS_MODEL_NAME, -EM EMBEDDINGS_MODEL_NAME 235 | Use this flag to set the Embeddings model name, see https://www.sbert.net/docs/pretrained_models.html for examples of names. Use the same model as 236 | used for ingesting the documents (ingest.py) 237 | --persist-directory PERSIST_DIRECTORY, -P PERSIST_DIRECTORY 238 | Use this flag to specify the name of the vector database, this will be a folder on the local machine. 
239 | --target-source-chunks TARGET_SOURCE_CHUNKS, -C TARGET_SOURCE_CHUNKS 240 | Use this flag to specify the name chunk size to use to chunk source data. 241 | --hide-source, -S Use this flag to disable printing of source documents used for answers. 242 | --mute-stream, -M Use this flag to disable the streaming StdOut callback for LLMs. 243 | ``` 244 | 245 | #### via Docker container 246 | 247 | You can also setup an isolated environment i.e. inside Docker container and perform the same above operations 248 | 249 | ```shell 250 | cd docker 251 | ./build-docker-image.sh 252 | ``` 253 | 254 | when finished with building the container run the below 255 | 256 | ```shell 257 | ./run-docker-container.sh 258 | ``` 259 | 260 | you will get a prompt like this: 261 | 262 | ```shell 263 | root@[your machine name]:/home#: 264 | ``` 265 | 266 | in there, type the same commands as in the **via native shell CLI** sections of [Ingesting files](#ingesting-files) and [Ask questions](#ask-questions) respectively. 267 | 268 | 269 | ### Try a different model 270 | 271 | ```shell 272 | ollama pull llama2:13b 273 | python3 lpiGPT.py --chat-model=llama2:13b 274 | ``` 275 | 276 | ### Adding more files 277 | 278 | Put any and all your files into the `source_documents` directory 279 | 280 | The supported extensions are: 281 | 282 | - `.csv`: CSV 283 | and others, we have trimmed them off from here to keep this example simple and concise. 284 | 285 | [back to ToC](#table-of-contents) 286 | 287 | ## Models 288 | 289 | ### Embeddings models 290 | 291 | For embeddings model, the example uses a sentence-transformers model https://www.sbert.net/docs/pretrained_models.html 292 | The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2` is 5 times faster and still offers good quality. 
293 | 294 | ### Chat models 295 | 296 | For chat models, have a look at [this list](https://github.com/jmorganca/ollama/#model-library) on [Ollama's github repo](https://github.com/jmorganca/ollama/). The list is basic, hence other LLM resources must be consulted i.e. 297 | 298 | - [Kaggle models](https://www.kaggle.com/models?query=LLM) 299 | - [HuggingFace models](https://huggingface.co/models?other=LLM) 300 | - ...(others).. 301 | 302 | _Please share your resources on either or both of the Embeddings and Chat models with us_ 303 | 304 | ## Contributing 305 | 306 | We are open to any or all of the below from your side in terms of contributions: 307 | 308 | - Reporting issues 309 | - Submitting pull requests 310 | - Coding standards or guidelines 311 | - Testing requirements 312 | 313 | ## License 314 | 315 | See [LICENSE](https://github.com/neomatrix369/learning-path-index/blob/main/LICENSE) in the root folder of the project 316 | 317 | [back to ToC](#table-of-contents) 318 | -------------------------------------------------------------------------------- /data/dataset-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "errorMessageNullable": null, 3 | "info": { 4 | "datasetSlugNullable": "learning-path-index-dataset", 5 | "ownerUserNullable": "neomatrix369", 6 | "usabilityRatingNullable": 1.0, 7 | "titleNullable": "Learning Path Index Dataset", 8 | "subtitleNullable": "A comprehensive dataset of Data Science, ML and AI learning paths and courses", 9 | "descriptionNullable": "# Description\nThe **Learning Path Index Dataset** is a comprehensive collection of byte-sized courses and learning materials tailored for individuals eager to delve into the fields of Data Science, Machine Learning, and Artificial Intelligence (AI), making it an indispensable reference for students, professionals, and educators in the Data Science and AI communities.\n\nThis **Kaggle Dataset** along with the KaggleX [Learning Path Index 
GitHub Repo](https://github.com/neomatrix369/learning-path-index) were created by the mentors and mentees of **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between _August 2023_ and _November 2023_, also [see this](https://www.kaggle.com/discussions/general/409607)). See **Credits** section at the bottom of the long description.\n\n# Inspiration\nThis dataset was created out of a commitment to facilitate learning and growth within the Data Science, Machine Learning, and AI communities. It started off as an idea at the end of **Cohort 2** of the [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) brainstorming and feedback session. It was one of the ideas to create byte-sized learning material to help our KaggleX mentees learn things faster. It aspires to simplify the process of finding, evaluating, and selecting the most fitting educational resources.\n\n# Context\nThis dataset was meticulously curated to assist learners in navigating the vast landscape of Data Science, Machine Learning, and AI education. It serves as a compass for those aiming to develop their skills and expertise in these rapidly evolving fields. \n\nThe mentors and mentees communicated via **Discord**, **Trello**, **Google Hangout**, etc... to put together these artifacts and made them public for everyone to _use and contribute back_.\n\n# Sources\nThe dataset compiles data from a curated selection of reputable sources including leading educational platforms such as **Google Developer, Google Cloud Skill Boost, IBM, Fast AI**, etc. By drawing from these trusted sources, we ensure that the data is both accurate and pertinent. The raw data and other artifacts as a result of this exercise can be found on the GitHub Repo i.e. 
KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index).\n\n# Content\nThe dataset encompasses the following attributes:\n\n- **Course / Learning Material:** The title of the Data Science, Machine Learning, or AI course or learning material.\n- **Source:** The provider or institution offering the course.\n- **Course Level:** The proficiency level, ranging from Beginner to Advanced.\n- **Type (Free or Paid):** Indicates whether the course is available for free or requires payment.\n- **Module:** Specific module or section within the course.\n- **Duration:** The estimated time required to complete the module or course.\n- **Module / Sub-module Difficulty Level:** The complexity level of the module or sub-module.\n- **Keywords / Tags / Skills / Interests / Categories:** Relevant keywords, tags, or categories associated with the course with a focus on Data Science, Machine Learning, and AI.\n- **Links:** Hyperlinks to access the course or learning material directly.\n\n# How to contribute to this initiative?\n\n- You can also join us by taking part in the next [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607))\n- Keep your eyes open on the **Kaggle Discussions** page and other **KaggleX** social media channels. Or find us on the [Kaggle Discord](https://www.kaggle.com/discussions/general/429933) channel to learn more about the next steps\n- Create notebooks from this data\n- Create supplementary or complementary data for or from this dataset\n- Submit corrections/enhancements or anything else to help improve this dataset so it has a wider use and purpose\n\n# License\nThe **Learning Path Index Dataset** is openly shared under a permissive license, allowing users to utilize the data for educational, analytical, and research purposes within the Data Science, Machine Learning, and AI domains. 
Feel free to _fork the dataset_ and make it your own, we would be delighted if you contributed back to the dataset and/or our KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) as well.\n\n# Important Links\n\n- [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607))\n- KaggleX [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset)\n- KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index)\n- [New Official Kaggle Discord Server!](https://www.kaggle.com/discussions/general/429933)\n\n# Credits\nCredits for all the work done to create this Kaggle Dataset and the KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) goes to these mentors and mentees (in no particular order): [Manish Kumar](https://www.kaggle.com/manishkr1754), [Ben Aji](https://www.kaggle.com/benajii) (_mentor_), [Emmanuel Katchy](https://www.kaggle.com/tobetek), [Ezeogu Ernest](https://www.kaggle.com/tobetek), [Manish](https://www.kaggle.com/manish5), [Mustafa](https://www.kaggle.com/mustafa254), [Nnamdi Idowu-Anifowoshe](https://www.kaggle.com/idowuchukwudi), [Sheba Alkali](https://www.kaggle.com/shebaalkali), [Zainab ikeoluwa](https://www.kaggle.com/zainabikeoluwa), [Wendy Mak](https://www.kaggle.com/wwymak) (_mentor_), [Misirya Hameed](https://www.linkedin.com/in/misiriya-shahul-hameed-b3957875) (_mentor_), [Chukwuebuka Obi](https://www.kaggle.com/chukwuebukaobi), [Victor Umunna](https://www.kaggle.com/victorumunna), [Pui Yueng](https://www.kaggle.com/lorentzyeung), [Afolake Solomon](https://www.kaggle.com/flakkyddon), [Faith Osoro](https://www.kaggle.com/faithosoro), [Chukwudi Idowu](https://www.kaggle.com/chukwudiidowu), and many others who were part of the **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between 
_August 2023_ and _November 2023_, also [see this](https://www.kaggle.com/discussions/general/409607)).\n\nOur gratitude also goes to our silent supporters of this initiative from organisers to the mentors and mentees whose help and support kept us going.\n\n_**Note:** In case your name or mention is missed out in the above list, then please let us know._", 10 | "datasetId": 3766406, 11 | "datasetSlug": "learning-path-index-dataset", 12 | "hasDatasetSlug": true, 13 | "ownerUser": "neomatrix369", 14 | "hasOwnerUser": true, 15 | "usabilityRating": 1.0, 16 | "hasUsabilityRating": true, 17 | "totalViews": 1779, 18 | "totalVotes": 32, 19 | "totalDownloads": 226, 20 | "title": "Learning Path Index Dataset", 21 | "hasTitle": true, 22 | "subtitle": "A comprehensive dataset of Data Science, ML and AI learning paths and courses", 23 | "hasSubtitle": true, 24 | "description": "# Description\nThe **Learning Path Index Dataset** is a comprehensive collection of byte-sized courses and learning materials tailored for individuals eager to delve into the fields of Data Science, Machine Learning, and Artificial Intelligence (AI), making it an indispensable reference for students, professionals, and educators in the Data Science and AI communities.\n\nThis **Kaggle Dataset** along with the KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) were created by the mentors and mentees of **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between _August 2023_ and _November 2023_, also [see this](https://www.kaggle.com/discussions/general/409607)). See **Credits** section at the bottom of the long description.\n\n# Inspiration\nThis dataset was created out of a commitment to facilitate learning and growth within the Data Science, Machine Learning, and AI communities. 
It started off as an idea at the end of **Cohort 2** of the [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) brainstorming and feedback session. It was one of the ideas to create byte-sized learning material to help our KaggleX mentees learn things faster. It aspires to simplify the process of finding, evaluating, and selecting the most fitting educational resources.\n\n# Context\nThis dataset was meticulously curated to assist learners in navigating the vast landscape of Data Science, Machine Learning, and AI education. It serves as a compass for those aiming to develop their skills and expertise in these rapidly evolving fields. \n\nThe mentors and mentees communicated via **Discord**, **Trello**, **Google Hangout**, etc... to put together these artifacts and made them public for everyone to _use and contribute back_.\n\n# Sources\nThe dataset compiles data from a curated selection of reputable sources including leading educational platforms such as **Google Developer, Google Cloud Skill Boost, IBM, Fast AI**, etc. By drawing from these trusted sources, we ensure that the data is both accurate and pertinent. The raw data and other artifacts as a result of this exercise can be found on the GitHub Repo i.e. 
KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index).\n\n# Content\nThe dataset encompasses the following attributes:\n\n- **Course / Learning Material:** The title of the Data Science, Machine Learning, or AI course or learning material.\n- **Source:** The provider or institution offering the course.\n- **Course Level:** The proficiency level, ranging from Beginner to Advanced.\n- **Type (Free or Paid):** Indicates whether the course is available for free or requires payment.\n- **Module:** Specific module or section within the course.\n- **Duration:** The estimated time required to complete the module or course.\n- **Module / Sub-module Difficulty Level:** The complexity level of the module or sub-module.\n- **Keywords / Tags / Skills / Interests / Categories:** Relevant keywords, tags, or categories associated with the course with a focus on Data Science, Machine Learning, and AI.\n- **Links:** Hyperlinks to access the course or learning material directly.\n\n# How to contribute to this initiative?\n\n- You can also join us by taking part in the next [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607))\n- Keep your eyes open on the **Kaggle Discussions** page and other **KaggleX** social media channels. Or find us on the [Kaggle Discord](https://www.kaggle.com/discussions/general/429933) channel to learn more about the next steps\n- Create notebooks from this data\n- Create supplementary or complementary data for or from this dataset\n- Submit corrections/enhancements or anything else to help improve this dataset so it has a wider use and purpose\n\n# License\nThe **Learning Path Index Dataset** is openly shared under a permissive license, allowing users to utilize the data for educational, analytical, and research purposes within the Data Science, Machine Learning, and AI domains. 
Feel free to _fork the dataset_ and make it your own, we would be delighted if you contributed back to the dataset and/or our KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) as well.\n\n# Important Links\n\n- [KaggleX BIPOC Mentorship program](https://www.kaggle.com/kagglex) (also [see this](https://www.kaggle.com/discussions/general/409607))\n- KaggleX [Learning Path Index Dataset](https://www.kaggle.com/datasets/neomatrix369/learning-path-index-dataset)\n- KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index)\n- [New Official Kaggle Discord Server!](https://www.kaggle.com/discussions/general/429933)\n\n# Credits\nCredits for all the work done to create this Kaggle Dataset and the KaggleX [Learning Path Index GitHub Repo](https://github.com/neomatrix369/learning-path-index) goes to these mentors and mentees (in no particular order): [Manish Kumar](https://www.kaggle.com/manishkr1754), [Ben Aji](https://www.kaggle.com/benajii) (_mentor_), [Emmanuel Katchy](https://www.kaggle.com/tobetek), [Ezeogu Ernest](https://www.kaggle.com/tobetek), [Manish](https://www.kaggle.com/manish5), [Mustafa](https://www.kaggle.com/mustafa254), [Nnamdi Idowu-Anifowoshe](https://www.kaggle.com/idowuchukwudi), [Sheba Alkali](https://www.kaggle.com/shebaalkali), [Zainab ikeoluwa](https://www.kaggle.com/zainabikeoluwa), [Wendy Mak](https://www.kaggle.com/wwymak) (_mentor_), [Misirya Hameed](https://www.linkedin.com/in/misiriya-shahul-hameed-b3957875) (_mentor_), [Chukwuebuka Obi](https://www.kaggle.com/chukwuebukaobi), [Victor Umunna](https://www.kaggle.com/victorumunna), [Pui Yueng](https://www.kaggle.com/lorentzyeung), [Afolake Solomon](https://www.kaggle.com/flakkyddon), [Faith Osoro](https://www.kaggle.com/faithosoro), [Chukwudi Idowu](https://www.kaggle.com/chukwudiidowu), and many others who were part of the **Cohort 3** [KaggleX BIPOC Mentorship Program](https://www.kaggle.com/kagglex) (between 
_August 2023_ and _November 2023_, also [see this](https://www.kaggle.com/discussions/general/409607)).\n\nOur gratitude also goes to our silent supporters of this initiative from organisers to the mentors and mentees whose help and support kept us going.\n\n_**Note:** In case your name or mention is missed out in the above list, then please let us know._", 25 | "hasDescription": true, 26 | "isPrivate": false, 27 | "keywords": [ 28 | "education", 29 | "artificial intelligence", 30 | "computer science", 31 | "programming", 32 | "beginner" 33 | ], 34 | "licenses": [ 35 | { 36 | "nameNullable": "Apache 2.0", 37 | "name": "Apache 2.0", 38 | "hasName": true 39 | } 40 | ], 41 | "collaborators": [ 42 | { 43 | "username": "manish5", 44 | "role": "reader" 45 | }, 46 | { 47 | "username": "mustafa254", 48 | "role": "reader" 49 | }, 50 | { 51 | "username": "benajii", 52 | "role": "reader" 53 | }, 54 | { 55 | "username": "zainabikeoluwa", 56 | "role": "reader" 57 | }, 58 | { 59 | "username": "shebaalkali", 60 | "role": "reader" 61 | }, 62 | { 63 | "username": "ernestdatascience", 64 | "role": "reader" 65 | }, 66 | { 67 | "username": "idowuchukwudi", 68 | "role": "reader" 69 | }, 70 | { 71 | "username": "tobetek", 72 | "role": "reader" 73 | }, 74 | { 75 | "username": "manishkr1754", 76 | "role": "writer" 77 | } 78 | ], 79 | "data": [ 80 | { 81 | "path": "Learning_Pathway_Index.csv", 82 | "description": "This file contains information about Data Science, Machine Learning, and AI courses and learning materials.", 83 | "schema": { 84 | "fields": [ 85 | { 86 | "name": "Module_Code", 87 | "description": "The course code of the course or learning material.", 88 | "type": "string" 89 | }, 90 | { 91 | "name": "Course_Learning_Material", 92 | "description": "The title of the course or learning material.", 93 | "type": "string" 94 | }, 95 | { 96 | "name": "Source", 97 | "description": "The provider or institution offering the course.", 98 | "type": "string" 99 | }, 100 | { 101 | "name": 
"Course_Level", 102 | "description": "The proficiency level, ranging from Beginner to Advanced.", 103 | "type": "string" 104 | }, 105 | { 106 | "name": "Type_Free_Paid", 107 | "description": "Indicates whether the course is available for free or requires payment.", 108 | "type": "string" 109 | }, 110 | { 111 | "name": "Module", 112 | "description": "Specific module or section within the course.", 113 | "type": "string" 114 | }, 115 | { 116 | "name": "Duration", 117 | "description": "The estimated time required to complete the module or course.", 118 | "type": "float" 119 | }, 120 | { 121 | "name": "Difficulty_Level", 122 | "description": "The complexity level of the module or sub-module.", 123 | "type": "string" 124 | }, 125 | { 126 | "name": "Keywords_Tags_Skills_Interests_Categories", 127 | "description": "Relevant keywords, tags, or categories associated with the course with a focus on Data Science, Machine Learning, and AI.", 128 | "type": "string" 129 | }, 130 | { 131 | "name": "Links", 132 | "description": "Hyperlinks to access the course or learning material directly.", 133 | "type": "string" 134 | } 135 | ] 136 | } 137 | }, 138 | { 139 | "path": "Courses_and_Learning_Material.csv", 140 | "description": "This file contains information about Data Science, Machine Learning, and AI courses and learning materials.", 141 | "schema": { 142 | "fields": [ 143 | { 144 | "name": "Module_Code", 145 | "description": "The course code of the course or learning material.", 146 | "type": "string" 147 | }, 148 | { 149 | "name": "Source", 150 | "description": "The provider or institution offering the course.", 151 | "type": "string" 152 | }, 153 | { 154 | "name": "Course_Level", 155 | "description": "The proficiency level, ranging from Beginner to Advanced.", 156 | "type": "string" 157 | }, 158 | { 159 | "name": "Duration", 160 | "description": "The estimated time required to complete the module or course.", 161 | "type": "string" 162 | }, 163 | { 164 | "name": 
"Prerequisites", 165 | "description": "One or more courses that need to be completed before a learner can enroll in or take the current course.", 166 | "type": "string" 167 | }, 168 | { 169 | "name": "Prework", 170 | "description": "Foundational knowledge or skills required for successful engagement with the course material.", 171 | "type": "string" 172 | }, 173 | { 174 | "name": "Course_Learning_Material", 175 | "description": "Course Title/Name of the Data Science, Machine Learning, or AI course or learning material.", 176 | "type": "string" 177 | }, 178 | { 179 | "name": "Course_Learning_Material_Link", 180 | "description": "Hyperlinks to access the course or learning material directly.", 181 | "type": "string" 182 | }, 183 | { 184 | "name": "Type_Free_Paid", 185 | "description": "Indicates whether the course is available for free or requires payment.", 186 | "type": "string" 187 | } 188 | ] 189 | } 190 | } 191 | ] 192 | }, 193 | "errorMessage": "", 194 | "hasErrorMessage": false 195 | } -------------------------------------------------------------------------------- /data/Courses_and_Learning_Material.csv: -------------------------------------------------------------------------------- 1 | Module_Code,Source,Course_Level,Duration,Prerequisites,Prework,Course_Learning_Material,Course_Learning_Material_Link,Type_Free_Paid 2 | CLMML00,Google Developers,Beginners,70 minutes,No,No,Introduction to Machine Learning,https://developers.google.com/machine-learning/intro-to-ml,Free 3 | CLMML01,Google Developers,Beginners to Intermediate,,"Yes, a handful ",Yes,Machine Learning Crash Course (Foundation),https://developers.google.com/machine-learning/crash-course,Free 4 | CLMML02,Google Developers,Beginners to Intermediate,45 minutes,No,No,Problem Framing (ML related),https://developers.google.com/machine-learning/problem-framing,Free 5 | CLMML03,Google Developers,Beginners to Intermediate,,No,No,Data Preparation and Feature Engineering in 
ML,https://developers.google.com/machine-learning/data-prep,Free 6 | CLMML04,Google Developers,Beginners to Intermediate,,Yes,No,Testing and Debugging,https://developers.google.com/machine-learning/testing-debugging,Free 7 | CLMML05,Google Developers,Intermediate to Advanced,,No,No,Decision Forests,https://developers.google.com/machine-learning/decision-forests,Free 8 | CLMML06,Google Developers,Intermediate to Advanced,,No,No,Recommendation Systems,https://developers.google.com/machine-learning/recommendation,Free 9 | CLMML07,Google Developers,Intermediate to Advanced,,No,No,Clustering,https://developers.google.com/machine-learning/clustering,Free 10 | CLMML08,Google Developers,Intermediate to Advanced,,No,No,Generative Adversarial Networks,https://developers.google.com/machine-learning/gan,Free 11 | CLMML09,Google Developers,Intermediate to Advanced,,No,No,Image Classification,https://developers.google.com/machine-learning/practica/image-classification,Free 12 | CLMML10,Google Developers,Intermediate to Advanced,,No,No,Fairness in Perspective API,https://developers.google.com/machine-learning/practica/fairness-indicators,Free 13 | CLMF001,Fast.ai,Intermediate to Advanced,,Yes,Varies,Fast.ai,https://course.fast.ai/,Free 14 | CLMAIE1,IBM,Beginners to Intermediate,,No,No,AI Ethics Resources,https://www.ibm.com/topics/ai-ethics,Free 15 | CLMML10,Google Developers,Intermediate to Advanced,,Yes,Yes,ML Engineer / Data Scientist Google Cloud Learning Path,https://docs.google.com/presentation/d/18dV09U9JqqB01RbMrfwRUB3LkSAu5L8R/edit#slide=id.p1,Free 16 | CLMML11,Google Cloud Skills Boost,Intermediate to Advanced,,Yes,Yes,Machine Learning Engineer Learning Path,https://www.cloudskillsboost.google/paths/17,Free 17 | CLMML12,Google Developers,Intermediate to Advanced,,Yes,Yes,Machine Learning Advance Courses,https://developers.google.com/machine-learning/advanced-courses,Free 18 | CLMK001,Kaggle Learn,Beginners to 
Intermediate,,No,No,Kaggle:Courses,https://www.kaggle.com/learn,Free 19 | CLMG001,Google Cloud Skills Boost,Intermediate,2 days,No,No,Data Engineer - Preparing for the Google Cloud Professional Data Engineer Exam,https://www.cloudskillsboost.google/course_templates/72?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346376,Free during the mentorship program 20 | CLMG002,Google Cloud Skills Boost,Beginners,1 day,No,No,Data Engineer - Google Cloud Big Data and Machine Learning Fundamentals,https://www.cloudskillsboost.google/course_templates/3?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346323,Free during the mentorship program 21 | CLMG003,Google Cloud Skills Boost,Beginners,2 days,No,No,Data Engineer - Modernizing Data Lakes and Data Warehouses with Google Cloud,https://www.cloudskillsboost.google/course_templates/54?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346330,Free during the mentorship program 22 | CLMG004,Google Cloud Skills Boost,Beginners,3 days,No,No,Data Engineer - Building Batch Data Pipelines on Google Cloud,https://www.cloudskillsboost.google/course_templates/53?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346338,Free during the mentorship program 23 | CLMG005,Google Cloud Skills Boost,Beginners,3 days,No,No,Data Engineer - Building Resilient Streaming Analytics Systems on Google Cloud,https://www.cloudskillsboost.google/course_templates/52?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346347,Free during the mentorship program 24 | CLMG006,Google Cloud Skills Boost,Beginners,5 days,No,No,"Data Engineer - Smart Analytics, Machine Learning, and AI on Google 
Cloud",https://www.cloudskillsboost.google/course_templates/55?catalog_rank=%7B%22rank%22%3A2%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346351,Free during the mentorship program 25 | CLMG007,Google Cloud Skills Boost,Intermediate,,No,No,Data Engineer - Serverless Data Processing with Dataflow: Foundations,https://www.cloudskillsboost.google/course_templates/218?catalog_rank=%7B%22rank%22%3A2%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346355,Free during the mentorship program 26 | CLMG008,Google Cloud Skills Boost,Advanced,,No,No,Data Engineer - Serverless Data Processing with Dataflow: Develop Pipelines,https://www.cloudskillsboost.google/course_templates/229?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346358,Free during the mentorship program 27 | CLMG009,Google Cloud Skills Boost,Advanced,,No,No,Data Engineer - Serverless Data Processing with Dataflow: Operations,https://www.cloudskillsboost.google/course_templates/264?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346360,Free during the mentorship program 28 | CLMG010,Google Cloud Skills Boost,Beginners,45 minutes,No,No,Data Engineer - Lab: A Tour of Google Cloud Hands-on Labs,https://www.cloudskillsboost.google/focuses/2794?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&parent=catalog&search_id=25346362,Free during the mentorship program 29 | CLMG011,Google Cloud Skills Boost,Beginners,1 hour,No,No,Data Engineer - Lab: Engineer Data in Google Cloud,https://www.cloudskillsboost.google/focuses/12379?catalog_rank=%7B%22rank%22%3A4%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&parent=catalog&search_id=25346300,Free during the mentorship program 30 | CLMG012,Google Cloud Skills Boost,Beginners,7 hours,No,No,"Data Engineer - Quest: Perform Foundational Data, ML, and AI Tasks in Google 
Cloud",https://www.cloudskillsboost.google/quests/117?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346364,Free during the mentorship program 31 | CLMG013,Google Cloud Skills Boost,Beginners,1 hour 30 minutes,No,No,Data Engineer - Quest: Build and Optimize Data Warehouses with BigQuery,https://www.cloudskillsboost.google/focuses/14341?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&parent=catalog&search_id=25346367,Free during the mentorship program 32 | CLMG014,Google Cloud Skills Boost,Intermediate,6 hours,No,No,Data Engineer - Quest: Engineer Data in Google Cloud,https://www.cloudskillsboost.google/quests/132?catalog_rank=%7B%22rank%22%3A2%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346300,Free during the mentorship program 33 | CLMG015,Google Cloud Skills Boost,Intermediate,4 hours,No,No,Data Engineer - Quest: Data Engineer,https://www.cloudskillsboost.google/quests/25?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346300,Free during the mentorship program 34 | CLMGA01,Google Cloud Skills Boost,Beginner,4 hours,No,No,Introduction to Generative AI Learning Path,https://www.cloudskillsboost.google/journeys/118,Free 35 | CLMGA02,Google Cloud Skills Boost,Beginner to Intermediate,10 hours,Yes,No,Generative AI for Developers Learning Path,https://www.cloudskillsboost.google/journeys/183,Free 36 | DLA:ai-for-everyone,DeepLearning.AI,Beginner,6 hours,No,Yes,AI for Everyone,https://www.deeplearning.ai//courses/ai-for-everyone,Free 37 | DLA:ai-for-medicine-specialization,DeepLearning.AI,Intermediate,3 months,Yes,Yes,AI for Medicine,https://www.deeplearning.ai//courses/ai-for-medicine-specialization,Paid 38 | DLA:deep-learning-specialization,DeepLearning.AI,Intermediate,6 months,Yes,Yes,Deep Learning Specialization,https://www.deeplearning.ai//courses/deep-learning-specialization,Paid 39 | 
DLA:natural-language-processing-specialization,DeepLearning.AI,Intermediate,4 months,Yes,Yes,Natural Language Processing,https://www.deeplearning.ai//courses/natural-language-processing-specialization,Paid 40 | DLA:tensorflow-developer-professional-certificate,DeepLearning.AI,Intermediate,2 - 4 weeks,No,Yes,TensorFlow Developer Professional Certificate,https://www.deeplearning.ai//courses/tensorflow-developer-professional-certificate,Paid 41 | DLA:tensorflow-data-and-deployment-specialization,DeepLearning.AI,Intermediate,2 - 6 Months,Yes,Yes,TensorFlow: Data and Deployment,https://www.deeplearning.ai//courses/tensorflow-data-and-deployment-specialization,Paid 42 | DLA:generative-adversarial-networks-gans-specialization,DeepLearning.AI,Intermediate,3 months,Yes,Yes,Generative Adversarial Networks (GANs),https://www.deeplearning.ai//courses/generative-adversarial-networks-gans-specialization,Paid 43 | DLA:tensorflow-advanced-techniques-specialization,DeepLearning.AI,Intermediate,5 months,Yes,No,TensorFlow: Advanced Techniques,https://www.deeplearning.ai//courses/tensorflow-advanced-techniques-specialization,Paid 44 | DLA:machine-learning-specialization,"DeepLearning.AI,Stanford Online",Beginner,2.5 Months,Yes,Yes,Machine Learning Specialization,https://www.deeplearning.ai//courses/machine-learning-specialization,Paid 45 | DLA:mathematics-for-machine-learning-and-data-science-specialization,DeepLearning.AI,Beginner,,Yes,Yes,Mathematics for Machine Learning and Data Science,https://www.deeplearning.ai//courses/mathematics-for-machine-learning-and-data-science-specialization,Paid 46 | DLA:ai-for-good,DeepLearning.AI,Beginner,2 - 4 Weeks,Yes,Yes,AI for Good,https://www.deeplearning.ai//courses/ai-for-good,Paid 47 | DLA:chatgpt-prompt-engineering-for-developers,"OpenAI,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,ChatGPT Prompt Engineering for Developers,https://www.deeplearning.ai//short-courses/chatgpt-prompt-engineering-for-developers,Paid 48 | 
DLA:building-systems-with-chatgpt,"OpenAI,DeepLearning.AI",Beginner,1 hour,Yes,Yes,Building Systems with the ChatGPT API,https://www.deeplearning.ai//short-courses/building-systems-with-chatgpt,Paid 49 | DLA:langchain-for-llm-application-development,"LangChain,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,LangChain for LLM Application Development,https://www.deeplearning.ai//short-courses/langchain-for-llm-application-development,Paid 50 | DLA:how-diffusion-models-work,DeepLearning.AI,Intermediate,1 Hour,Yes,Yes,How Diffusion Models Work,https://www.deeplearning.ai//short-courses/how-diffusion-models-work,Paid 51 | DLA:generative-ai-with-llms,"AWS,DeepLearning.AI",Intermediate,,Yes,Yes,Generative AI with LLMs,https://www.deeplearning.ai//courses/generative-ai-with-llms,Paid 52 | DLA:langchain-chat-with-your-data,"LangChain,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,LangChain: Chat with Your Data,https://www.deeplearning.ai//short-courses/langchain-chat-with-your-data,Paid 53 | DLA:building-generative-ai-applications-with-gradio,"DeepLearning.AI,Hugging Face",Beginner,1 Hour,Yes,Yes,Building Generative AI Applications with Gradio,https://www.deeplearning.ai//short-courses/building-generative-ai-applications-with-gradio,Paid 54 | DLA:evaluating-debugging-generative-ai,"Weights & Biases,DeepLearning.AI",Intermediate,1 Hour,Yes,Yes,Evaluating and Debugging Generative AI Models Using Weights and Biases,https://www.deeplearning.ai//short-courses/evaluating-debugging-generative-ai,Paid 55 | DLA:large-language-models-semantic-search,"Cohere,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,Large Language Models with Semantic Search,https://www.deeplearning.ai//short-courses/large-language-models-semantic-search,Paid 56 | DLA:finetuning-large-language-models,"Lamini,DeepLearning.AI",Intermediate,1 Hour,Yes,Yes,Finetuning Large Language Models,https://www.deeplearning.ai//short-courses/finetuning-large-language-models,Paid 57 | 
DLA:microsoft-semantic-kernel,"Microsoft,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,How Business Thinkers Can Start Building AI Plugins With Semantic Kernel,https://www.deeplearning.ai//short-courses/microsoft-semantic-kernel,Paid 58 | DLA:google-cloud-vertex-ai,"DeepLearning.AI,Google Cloud",Beginner,1 Hour,Yes,Yes,Understanding and Applying Text Embeddings,https://www.deeplearning.ai//short-courses/google-cloud-vertex-ai,Paid 59 | DLA:pair-programming-llm,"Google,DeepLearning.AI",Beginner,1 Hour,Yes,Yes,Pair Programming with a Large Language Model,https://www.deeplearning.ai//short-courses/pair-programming-llm,Paid 60 | DLA:generative-ai-for-everyone,DeepLearning.AI,Beginner,1 Hour,Yes,Yes,Generative AI for Everyone,https://www.deeplearning.ai//courses/generative-ai-for-everyone,Paid 61 | DLA:functions-tools-agents-langchain,"LangChain,DeepLearning.AI",Intermediate,1 Hour,Yes,Yes,"Functions, Tools and Agents with LangChain",https://www.deeplearning.ai//short-courses/functions-tools-agents-langchain,Paid 62 | DLA:vector-databases-embeddings-applications,"Weaviate,DeepLearning.AI",Intermediate,,Yes,Yes,Vector Databases: from Embeddings to Applications,https://www.deeplearning.ai//short-courses/vector-databases-embeddings-applications,Paid 63 | DLA:quality-safety-llm-applications,"WhyLabs,DeepLearning.AI",Beginner,,Yes,Yes,Quality and Safety for LLM Applications,https://www.deeplearning.ai//short-courses/quality-safety-llm-applications,Paid 64 | DLA:building-evaluating-advanced-rag,"LlamaIndex,DeepLearning.AI,TruEra",Beginner,,Yes,Yes,Building and Evaluating Advanced RAG Applications,https://www.deeplearning.ai//short-courses/building-evaluating-advanced-rag,Paid 65 | DLA:reinforcement-learning-from-human-feedback,"DeepLearning.AI,Google Cloud",Intermediate,,Yes,Yes,Reinforcement Learning from Human Feedback,https://www.deeplearning.ai//short-courses/reinforcement-learning-from-human-feedback,Paid 66 | 
DLA:advanced-retrieval-for-ai,"DeepLearning.AI,Chroma",Intermediate,,Yes,Yes,Advanced Retrieval for AI with Chroma,https://www.deeplearning.ai//short-courses/advanced-retrieval-for-ai,Paid 67 | DLA:build-llm-apps-with-langchain-js,"LangChain,DeepLearning.AI",Intermediate,,Yes,Yes,Build LLM Apps with LangChain.js,https://www.deeplearning.ai//short-courses/build-llm-apps-with-langchain-js,Paid 68 | DLA:llmops,"DeepLearning.AI,Google Cloud",Beginner,,Yes,Yes,LLMOps,https://www.deeplearning.ai//short-courses/llmops,Paid 69 | DLA:automated-testing-llmops,"DeepLearning.AI,CircleCI",Intermediate,,Yes,Yes,Automated Testing for LLMOps,https://www.deeplearning.ai//short-courses/automated-testing-llmops,Paid 70 | DLA:building-applications-vector-databases,"Pinecone,DeepLearning.AI",Beginner,,Yes,Yes,Building Applications with Vector Databases,https://www.deeplearning.ai//short-courses/building-applications-vector-databases,Paid 71 | DLA:serverless-llm-apps-amazon-bedrock,"AWS,DeepLearning.AI",Intermediate,,Yes,Yes,Serverless LLM apps with Amazon Bedrock,https://www.deeplearning.ai//short-courses/serverless-llm-apps-amazon-bedrock,Paid 72 | DLA:prompt-engineering-with-llama-2,"Meta,DeepLearning.AI",Beginner,,Yes,Yes,Prompt Engineering with Llama 2 & 3,https://www.deeplearning.ai//short-courses/prompt-engineering-with-llama-2,Paid 73 | DLA:open-source-models-hugging-face,"DeepLearning.AI,Hugging Face",Beginner,,Yes,Yes,Open Source Models with Hugging Face,https://www.deeplearning.ai//short-courses/open-source-models-hugging-face,Paid 74 | DLA:knowledge-graphs-rag,"DeepLearning.AI,Neo4j",Intermediate,,Yes,Yes,Knowledge Graphs for RAG,https://www.deeplearning.ai//short-courses/knowledge-graphs-rag,Paid 75 | DLA:efficiently-serving-llms,"DeepLearning.AI,Predibase",Intermediate,,Yes,Yes,Efficiently Serving LLMs,https://www.deeplearning.ai//short-courses/efficiently-serving-llms,Paid 76 | 
DLA:javascript-rag-web-apps-with-llamaindex,"LlamaIndex,DeepLearning.AI",Beginner,,Yes,Yes,JavaScript RAG Web Apps with LlamaIndex,https://www.deeplearning.ai//short-courses/javascript-rag-web-apps-with-llamaindex,Paid 77 | DLA:red-teaming-llm-applications,"DeepLearning.AI,Giskard",Beginner,,Yes,Yes,Red Teaming LLM Applications,https://www.deeplearning.ai//short-courses/red-teaming-llm-applications,Paid 78 | DLA:preprocessing-unstructured-data-for-llm-applications,"Unstructured,DeepLearning.AI",Beginner,,Yes,Yes,Preprocessing Unstructured Data for LLM Applications,https://www.deeplearning.ai//short-courses/preprocessing-unstructured-data-for-llm-applications,Paid 79 | DLA:quantization-fundamentals-with-hugging-face,"DeepLearning.AI,Hugging Face",Beginner,,Yes,Yes,Quantization Fundamentals with Hugging Face,https://www.deeplearning.ai//short-courses/quantization-fundamentals-with-hugging-face,Paid 80 | DLA:getting-started-with-mistral,"DeepLearning.AI,Mistral AI",Beginner,,Yes,Yes,Getting Started With Mistral,https://www.deeplearning.ai//short-courses/getting-started-with-mistral,Paid 81 | DLA:prompt-engineering-for-vision-models,"Comet,DeepLearning.AI",Beginner,,Yes,Yes,Prompt Engineering for Vision Models,https://www.deeplearning.ai//short-courses/prompt-engineering-for-vision-models,Paid 82 | DLA:quantization-in-depth,"DeepLearning.AI,Hugging Face",Intermediate,,Yes,Yes,Quantization in Depth,https://www.deeplearning.ai//short-courses/quantization-in-depth,Paid 83 | DLA:machine-learning-in-production,DeepLearning.AI,Intermediate,4 months,Yes,Yes,Machine Learning in Production,https://www.deeplearning.ai//courses/machine-learning-in-production,Paid 84 | DLA:building-agentic-rag-with-llamaindex,"LlamaIndex,DeepLearning.AI",Beginner,,Yes,Yes,Building Agentic RAG with LlamaIndex,https://www.deeplearning.ai//short-courses/building-agentic-rag-with-llamaindex,Paid 85 | DLA:building-multimodal-search-and-rag,"Weaviate,DeepLearning.AI",Intermediate,,Yes,Yes,Building 
Multimodal Search and RAG,https://www.deeplearning.ai//short-courses/building-multimodal-search-and-rag,Paid 86 | DLA:multi-ai-agent-systems-with-crewai,"DeepLearning.AI,crewAI",Beginner,,Yes,Yes,Multi AI Agent Systems with crewAI,https://www.deeplearning.ai//short-courses/multi-ai-agent-systems-with-crewai,Paid 87 | DLA:introduction-to-on-device-ai,"DeepLearning.AI,Qualcomm",Beginner,,Yes,Yes,Introduction to On-Device AI,https://www.deeplearning.ai//short-courses/introduction-to-on-device-ai,Paid 88 | DLA:ai-agentic-design-patterns-with-autogen,"Microsoft,DeepLearning.AI,Penn State University",Beginner,,Yes,Yes,AI Agentic Design Patterns with AutoGen,https://www.deeplearning.ai//short-courses/ai-agentic-design-patterns-with-autogen,Paid 89 | DLA:ai-agents-in-langgraph,"LangChain,Tavily,DeepLearning.AI",Intermediate,,Yes,Yes,AI Agents in LangGraph,https://www.deeplearning.ai//short-courses/ai-agents-in-langgraph,Paid 90 | DLA:building-your-own-database-agent,"Microsoft,DeepLearning.AI",Beginner,,Yes,No,Building Your Own Database Agent,https://www.deeplearning.ai//short-courses/building-your-own-database-agent,Paid 91 | DLA:function-calling-and-data-extraction-with-llms,"DeepLearning.AI,Nexusflow",Beginner,,Yes,Yes,Function-Calling and Data Extraction with LLMs,https://www.deeplearning.ai//short-courses/function-calling-and-data-extraction-with-llms,Paid 92 | DLA:carbon-aware-computing-for-genai-developers,"DeepLearning.AI,Google Cloud",Beginner,,Yes,Yes,Carbon Aware Computing for GenAI Developers,https://www.deeplearning.ai//short-courses/carbon-aware-computing-for-genai-developers,Paid 93 | DLA:prompt-compression-and-query-optimization,"DeepLearning.AI,MongoDB",Intermediate,,Yes,No,Prompt Compression and Query Optimization,https://www.deeplearning.ai//short-courses/prompt-compression-and-query-optimization,Paid 94 | DLA:generative-ai-for-software-development,DeepLearning.AI,Intermediate,15 hours,Yes,No,Generative AI for Software 
Development,https://www.deeplearning.ai//courses/generative-ai-for-software-development,Paid 95 | DLA:pretraining-llms,"DeepLearning.AI,Upstage",Intermediate,,Yes,No,Pretraining LLMs,https://www.deeplearning.ai//short-courses/pretraining-llms,Paid 96 | DLA:intro-to-federated-learning,"DeepLearning.AI,Flower Labs",Beginner,,Yes,Yes,Federated Learning,https://www.deeplearning.ai//short-courses/intro-to-federated-learning,Paid 97 | DLA:embedding-models-from-architecture-to-implementation,"DeepLearning.AI,Vectara",Beginner,,Yes,Yes,Embedding Models: From Architecture to Implementation,https://www.deeplearning.ai//short-courses/embedding-models-from-architecture-to-implementation,Paid 98 | DLA:data-engineering,DeepLearning.AI,Intermediate,15 weeks,Yes,Yes,Data Engineering,https://www.deeplearning.ai//courses/data-engineering,Paid 99 | DLA:ai-python-for-beginners,DeepLearning.AI,Beginner,,Yes,Yes,AI Python for Beginners,https://www.deeplearning.ai//short-courses/ai-python-for-beginners,Paid 100 | --------------------------------------------------------------------------------