├── .flake8
├── project_explainer
│   ├── gh_explainer
│   │   ├── __init__.py
│   │   └── summarize.py
│   ├── .gitignore
│   ├── pyproject.toml
│   └── examples
│       └── examples.py
├── static
│   ├── ui.png
│   └── logos
│       ├── logo.png
│       └── logo.svg
├── graph_rag
│   ├── evaluation
│   │   ├── random
│   │   │   ├── dataset_200_llama3.pkl
│   │   │   ├── keras_docs_embedded.pkl
│   │   │   └── results_5.csv
│   │   ├── evaluation_llama_index.py
│   │   ├── ragas_evaluation
│   │   │   ├── prompts.py
│   │   │   ├── evaluation_ragas.py
│   │   │   └── QA_graphrag_testdataset.py
│   │   └── README.MD
│   ├── graph_builder
│   │   ├── Example
│   │   │   ├── random
│   │   │   │   └── visualisation.png
│   │   │   └── build_with_relic.MD
│   │   ├── requirements.txt
│   │   ├── main.py
│   │   ├── README.MD
│   │   ├── knowledgeGraph.py
│   │   └── tools.py
│   ├── experiments
│   │   ├── artifacts
│   │   │   ├── data_keras
│   │   │   │   ├── index4.md
│   │   │   │   ├── index5.md
│   │   │   │   ├── index3.md
│   │   │   │   ├── index1.md
│   │   │   │   └── index2.md
│   │   │   ├── gemma2
│   │   │   │   └── gemma2graphIndex.pkl
│   │   │   ├── mistral
│   │   │   │   └── mistralgraphIndex.pkl
│   │   │   ├── phi3
│   │   │   │   └── graphIndex_phi3_mscb.pkl
│   │   │   ├── vizualization
│   │   │   │   └── visualisation.png
│   │   │   ├── phi3-med
│   │   │   │   └── graphIndex_phi3_medium_mscb.pkl
│   │   │   └── neural_chat
│   │   │       └── graphIndex_neuralchat_mscb.pkl
│   │   └── EXPERIMENTS.MD
│   └── graph_retrieval
│       ├── training_scripts
│       │   ├── prompt_tuning
│       │   │   ├── config.yaml
│       │   │   └── p_tuning.py
│       │   └── QLoRA_tuning
│       │       ├── config.yaml
│       │       └── qlora_adapter.py
│       ├── graph_retrieval.py
│       └── README.MD
├── project_explainer_ui
│   ├── README.md
│   ├── requirements.txt
│   ├── .gitignore
│   └── ui.py
├── project_processor
│   ├── README.md
│   ├── .gitignore
│   ├── pyproject.toml
│   ├── gh_processor
│   │   ├── github_downloader.py
│   │   ├── __init__.py
│   │   └── file_utils.py
│   └── examples
│       └── examples.py
├── .gitignore
├── .github
│   └── workflows
│       └── python-app.yml
├── experiments
│   ├── experiment_t5_abs_summarization
│   │   └── experiment_t5_abs_summarization.ipynb
│   └── experiment_bart_ft_abs_summarization
│       └── experiment_bart_ft_abs_summarization_eval.ipynb
├── README.md
└── LICENSE
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude = *.ipynb
--------------------------------------------------------------------------------
/project_explainer/gh_explainer/__init__.py:
--------------------------------------------------------------------------------
1 | from .summarize import Explainer
--------------------------------------------------------------------------------
/static/ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/static/ui.png
--------------------------------------------------------------------------------
/static/logos/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/static/logos/logo.png
--------------------------------------------------------------------------------
/graph_rag/evaluation/random/dataset_200_llama3.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/evaluation/random/dataset_200_llama3.pkl
--------------------------------------------------------------------------------
/graph_rag/evaluation/random/keras_docs_embedded.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/evaluation/random/keras_docs_embedded.pkl
--------------------------------------------------------------------------------
/graph_rag/graph_builder/Example/random/visualisation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/graph_builder/Example/random/visualisation.png
--------------------------------------------------------------------------------
/graph_rag/experiments/artifacts/data_keras/index4.md:
--------------------------------------------------------------------------------
1 | # KerasCV
2 |
3 | These guides cover the [KerasCV](/keras_cv/) library.
4 |
5 | ## Available guides
6 |
7 | {{toc}}
8 |
--------------------------------------------------------------------------------
/graph_rag/experiments/artifacts/data_keras/index5.md:
--------------------------------------------------------------------------------
1 | # KerasNLP
2 |
3 | These guides cover the [KerasNLP](/keras_nlp/) library.
4 |
5 | ## Available guides
6 |
7 | {{toc}}
8 |
--------------------------------------------------------------------------------
/graph_rag/experiments/artifacts/gemma2/gemma2graphIndex.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/gemma2/gemma2graphIndex.pkl
--------------------------------------------------------------------------------
/graph_rag/experiments/artifacts/mistral/mistralgraphIndex.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/mistral/mistralgraphIndex.pkl
--------------------------------------------------------------------------------
/graph_rag/experiments/artifacts/phi3/graphIndex_phi3_mscb.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/phi3/graphIndex_phi3_mscb.pkl
--------------------------------------------------------------------------------
/graph_rag/experiments/artifacts/vizualization/visualisation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/vizualization/visualisation.png
--------------------------------------------------------------------------------
/graph_rag/experiments/artifacts/phi3-med/graphIndex_phi3_medium_mscb.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/phi3-med/graphIndex_phi3_medium_mscb.pkl
--------------------------------------------------------------------------------
/graph_rag/experiments/artifacts/neural_chat/graphIndex_neuralchat_mscb.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/neural_chat/graphIndex_neuralchat_mscb.pkl
--------------------------------------------------------------------------------
/graph_rag/graph_builder/requirements.txt:
--------------------------------------------------------------------------------
1 | llama-index-embeddings-huggingface
2 | llama-index-llms-ollama
3 | llama-index
4 | pyvis
5 | tree-sitter==0.21.3
6 | tree-sitter-languages
7 | tqdm
8 | ragas
9 | datasets
10 | pandas
--------------------------------------------------------------------------------
/project_explainer_ui/README.md:
--------------------------------------------------------------------------------
1 | ### Project Explainer UI
2 |
3 | Simple UI using Gradio.
4 |
5 | ### Dependencies
6 |
7 | ```
8 | pip install -r requirements.txt
9 | ```
10 |
11 | ### Start UI
12 |
13 | ```
14 | python ui.py
15 | ```
16 |
--------------------------------------------------------------------------------
/project_explainer_ui/requirements.txt:
--------------------------------------------------------------------------------
1 | gradio
2 | -e git+https://github.com/c2siorg/Project-Explainer.git@main#subdirectory=project_processor&egg=gh_processor
3 | -e git+https://github.com/c2siorg/Project-Explainer.git@main#subdirectory=project_explainer&egg=gh_explainer
--------------------------------------------------------------------------------
/project_processor/README.md:
--------------------------------------------------------------------------------
1 | ## Project Repository Utilities
2 |
3 | A simple Python module packed with utilities to process files in a project repository, such as a Git repository.
4 |
5 | ## Installation
6 |
7 | ```
8 | pip install "git+https://github.com/c2siorg/Project-Explainer.git@main#subdirectory=project_processor&egg=gh_processor"
9 | ```
10 |
11 |
--------------------------------------------------------------------------------
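A minimal usage sketch of the installed package, mirroring `project_processor/examples/examples.py` (the repository URL is just an example):

```python
from gh_processor import (
    download_github_repo,
    get_files_by_extension,
    extract_headings_with_paragraphs_from_markdown,
)

# Clone the repository (defaults to the "main" branch) and get its local path.
repo_path = download_github_repo("https://github.com/c2siorg/Project-Explainer.git")

# Collect markdown files and map each one to its headings with their paragraphs.
headings_with_content = {
    md_file: extract_headings_with_paragraphs_from_markdown(md_file)
    for md_file in get_files_by_extension(repo_path, [".md"])
}
print(headings_with_content)
```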
/graph_rag/graph_builder/main.py:
--------------------------------------------------------------------------------
1 | """
2 | This file calls a series of functions from graph_rag to build the knowledge graph.
3 | """
4 |
5 | from tools import initialize_llm, load_directory
6 | from knowledgeGraph import build_graph, save_index
7 |
8 |
9 | initialize_llm()
10 | documents = load_directory("/data")
11 | index = build_graph(documents)
12 | save_index(index)
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | dist/
11 | build/
12 | *.egg-info/
13 |
14 | # Virtual environments
15 | venv/
16 | env/
17 | *.env
18 |
19 | # Development tools
20 | .tox/
21 | .idea/
22 | .vscode/
23 |
24 | # IDE-specific files
25 | *.swp
26 | *.swo
27 | *.pyc
28 | .DS_Store
29 |
--------------------------------------------------------------------------------
/project_explainer/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | dist/
11 | build/
12 | *.egg-info/
13 |
14 | # Virtual environments
15 | venv/
16 | env/
17 | *.env
18 |
19 | # Development tools
20 | .tox/
21 | .idea/
22 | .vscode/
23 |
24 | # IDE-specific files
25 | *.swp
26 | *.swo
27 | *.pyc
28 | .DS_Store
29 |
--------------------------------------------------------------------------------
/project_processor/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | dist/
11 | build/
12 | *.egg-info/
13 |
14 | # Virtual environments
15 | venv/
16 | env/
17 | *.env
18 |
19 | # Development tools
20 | .tox/
21 | .idea/
22 | .vscode/
23 |
24 | # IDE-specific files
25 | *.swp
26 | *.swo
27 | *.pyc
28 | .DS_Store
29 |
--------------------------------------------------------------------------------
/project_explainer_ui/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | dist/
11 | build/
12 | *.egg-info/
13 |
14 | # Virtual environments
15 | venv/
16 | env/
17 | *.env
18 |
19 | # Development tools
20 | .tox/
21 | .idea/
22 | .vscode/
23 |
24 | # IDE-specific files
25 | *.swp
26 | *.swo
27 | *.pyc
28 | .DS_Store
29 |
--------------------------------------------------------------------------------
/graph_rag/graph_retrieval/training_scripts/prompt_tuning/config.yaml:
--------------------------------------------------------------------------------
1 | Data:
2 | repo_path : '/content/keras-io/templates'
3 | extensions : ['md']
4 | output_file : 'merged_output.txt'
5 | ModeL:
6 | model: 'bigcode/starcoderbase-1b'
7 | context_length: 128
8 | Training:
9 | masked_language_modelling: False
10 | num_virtual_tokens : 4
11 | num_epochs : 6
12 | learning_rate: 0.0035
13 | output_dir: "/"
14 | max_steps: 4
15 | batch_size: 25
16 | auto_batch_size : False
17 | push_to_hub: False
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
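As an illustration only (the actual `p_tuning.py` may consume this file differently), a config like the one above can be read and mapped onto a PEFT prompt-tuning setup roughly as follows; this sketch assumes PyYAML, `peft`, and `transformers` are installed:

```python
import yaml
from peft import PromptTuningConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

# Read the prompt-tuning settings shown above (the model section key is spelled 'ModeL' in the file).
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# Map the Training section onto a PEFT prompt-tuning configuration.
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=cfg["Training"]["num_virtual_tokens"],
)

base_model = AutoModelForCausalLM.from_pretrained(cfg["ModeL"]["model"])
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()
```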
/project_explainer/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "gh_explainer"
7 | description = "explains a give github repo"
8 | readme = "README.md"
9 | requires-python = ">=3.7"
10 | license = {text = "Apache 2.0"}
11 | classifiers = [
12 | "Programming Language :: Python :: 3",
13 | ]
14 | dependencies = ["setuptools>=42", "wheel", "transformers", "jinja2", "torch"]
15 |
16 | dynamic = ["version"]
17 |
18 | [tool.setuptools]
19 | py-modules = ["gh_explainer"]
20 |
--------------------------------------------------------------------------------
/project_processor/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "gh_processor"
7 | description = "github repo file level processor utils"
8 | readme = "README.md"
9 | requires-python = ">=3.7"
10 | license = {text = "Apache 2.0"}
11 | classifiers = [
12 | "Programming Language :: Python :: 3",
13 | ]
14 | dependencies = ["setuptools>=42", "wheel", "gitpython", "markdown2", "spacy"]
15 |
16 | dynamic = ["version"]
17 |
18 | [tool.setuptools]
19 | py-modules = ["gh_processor"]
20 |
--------------------------------------------------------------------------------
/project_explainer_ui/ui.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | from gh_explainer import Explainer
3 |
4 | def summarize(summarization_type, github_project_url, github_project_branch="main", huggingface_model_id="gpt2"):
5 | gptExplainer = Explainer(huggingface_model_id)
6 | if summarization_type == "brief":
7 | return gptExplainer.brief(github_url=github_project_url, branch=github_project_branch)["summary"]
8 | return gptExplainer.outline(github_url=github_project_url, branch=github_project_branch)["summary"]
9 |
10 | demo = gr.Interface(
11 | fn=summarize,
12 | inputs=[gr.Dropdown(["brief", "outline"], label="summary level"), "text", "text", "text"],
13 | outputs=["text"],
14 | )
15 | demo.launch()
--------------------------------------------------------------------------------
/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/config.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | MODEL: "codellama/CodeLlama-7b-Instruct-hf"
3 | SEQ_LENGTH: 2048
4 | LOAD_IN_8BIT: False
5 |
6 | DATA:
7 | REPO_PATH: '/content/keras-io/templates'
8 | SEED: 0
9 | EXTENSIONS: [ 'md' ]
10 | OUTPUT_FILE: 'merged_output.txt'
11 |
12 | TRAINING_ARGUMENTS:
13 | BATCH_SIZE: 64
14 | GR_ACC_STEPS: 1
15 | LR: 5e-4
16 | LR_SCHEDULER_TYPE: "cosine"
17 | WEIGHT_DECAY: 0.01
18 | NUM_WARMUP_STEPS: 30
19 | EVAL_FREQ: 100
20 | SAVE_FREQ: 100
21 | LOG_FREQ: 10
22 | OUTPUT_DIR:
23 | BF16: True
24 | FP16: False
25 |
26 | LORA:
27 | LORA_R: 8
28 | LORA_ALPHA: 32
29 | LORA_DROPOUT: 0.0
30 | LORA_TARGET_MODULES:
31 |
32 | BNB_CONFIG:
33 | USE_NESTED_QUANT: True
34 | BNB_4BIT_COMPUTE_DTYPE: "bfloat16"
35 |
36 |
--------------------------------------------------------------------------------
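As a rough illustration only (the exact usage in `qlora_adapter.py` may differ), the `LORA` and `BNB_CONFIG` blocks above typically map onto `peft.LoraConfig` and `transformers.BitsAndBytesConfig` along these lines:

```python
import torch
from peft import LoraConfig
from transformers import BitsAndBytesConfig

# 4-bit quantization settings corresponding to the BNB_CONFIG block above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,          # USE_NESTED_QUANT
    bnb_4bit_compute_dtype=torch.bfloat16,   # BNB_4BIT_COMPUTE_DTYPE
)

# Adapter settings corresponding to the LORA block above.
lora_config = LoraConfig(
    r=8,               # LORA_R
    lora_alpha=32,     # LORA_ALPHA
    lora_dropout=0.0,  # LORA_DROPOUT
    task_type="CAUSAL_LM",
)
```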
/graph_rag/experiments/artifacts/data_keras/index3.md:
--------------------------------------------------------------------------------
1 | # KerasCV Bounding Boxes
2 |
3 | All KerasCV components that process bounding boxes require a `bounding_box_format`
4 | argument. This argument allows you to seamlessly integrate KerasCV components into
5 | your own workflows while preserving proper behavior of the components themselves.
6 |
7 | Bounding boxes are represented by dictionaries with two keys: `'boxes'` and `'classes'`:
8 |
9 | ```
10 | {
11 | 'boxes': [batch, num_boxes, 4],
12 | 'classes': [batch, num_boxes]
13 | }
14 | ```
15 |
16 | To ensure your bounding boxes comply with the KerasCV specification, you can use [`keras_cv.bounding_box.validate_format(boxes)`](https://github.com/keras-team/keras-cv/blob/master/keras_cv/bounding_box/validate_format.py).
17 |
18 | The bounding box formats supported in KerasCV
19 | [are listed in the API docs](/api/keras_cv/bounding_box/formats)
20 | If a format you would like to use is missing,
21 | [feel free to open a GitHub issue on KerasCV](https://github.com/keras-team/keras-cv/issues)!
22 |
--------------------------------------------------------------------------------
/project_processor/gh_processor/github_downloader.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from git import Repo
3 | import os
4 |
5 | logger = logging.getLogger(__name__)
6 | logger.setLevel(logging.INFO)
7 |
8 | console_handler = logging.StreamHandler()
9 | console_handler.setLevel(logging.INFO)
10 |
11 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
12 | console_handler.setFormatter(formatter)
13 |
14 | logger.addHandler(console_handler)
15 |
16 |
17 | def download_github_repo(repo_url: str, branch: str = "main") -> str:
18 | """
19 | Download a GitHub repository from the provided URL.
20 |
21 | Args:
22 | repo_url (str): The URL of the GitHub repository.
23 | branch (str): The branch of the GitHub repository.
24 |
25 | Returns:
26 | repo_path (str): Absolute path to downloaded repo
27 | """
28 | repo_name = repo_url.split("/")[-1].split(".")[0]
29 | repo_path = os.path.abspath(repo_name)
30 |
31 | Repo.clone_from(repo_url, repo_name, branch=branch)
32 |
33 | logger.info(f"Repository '{repo_name}' downloaded successfully!")
34 | return repo_path
35 |
--------------------------------------------------------------------------------
/project_processor/gh_processor/__init__.py:
--------------------------------------------------------------------------------
1 | from .github_downloader import download_github_repo
2 |
3 | from .file_utils import (extract_code_blocks_from_markdown,
4 | extract_headings_with_paragraphs_from_markdown,
5 | extract_images_from_markdown,
6 | extract_links_from_markdown,
7 | extract_project_description_from_readme,
8 | extract_tables_from_markdown,
9 | get_files_by_extension,
10 | get_elements_from_markdown_file,
11 | remove_headings_from_markdown_file,
12 | remove_sections_from_markdown,
13 | convert_markdown_file_to_html,
14 | convert_markdown_to_html,
15 | check_phrase_similarity_using_spacyweb,
16 | check_similarity,
17 | remove_code_blocks_from_markdown,
18 | remove_images_from_markdown,
19 | remove_links_from_markdown,
20 | remove_tables_from_markdown)
21 |
--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | name: Python application
5 |
6 | on:
7 | push:
8 | branches: [ "main" ]
9 | pull_request:
10 | branches: [ "main" ]
11 |
12 | permissions:
13 | contents: read
14 |
15 | jobs:
16 | build:
17 |
18 | runs-on: ubuntu-latest
19 |
20 | steps:
21 | - uses: actions/checkout@v3
22 | - name: Set up Python 3.10
23 | uses: actions/setup-python@v3
24 | with:
25 | python-version: "3.10"
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install flake8 pytest
30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
31 | - name: Lint with flake8
32 | run: |
33 | # stop the build if there are Python syntax errors or undefined names
34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
36 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
37 | - name: Test with pytest
38 | run: |
39 | bash -c 'pytest .; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret'
40 |
--------------------------------------------------------------------------------
/graph_rag/graph_builder/README.MD:
--------------------------------------------------------------------------------
1 | # Knowledge Graph Builder
2 |
3 | This project lets you build a Knowledge Graph from unstructured data (.md, .py files for now).
4 |
5 | ## Table of Contents
6 |
7 | - [Installation from Source](#installation-from-source)
8 | - [Usage](#usage)
9 |
10 |
11 |
12 |
13 | ## Installation from Source
14 |
15 | Follow these instructions to set up the project:
16 |
17 | ```bash
18 | git clone https://github.com/debrupf2946/KnowledgeGraphBuilder.git
19 | cd KnowledgeGraphBuilder
20 | pip3 install -r requirements.txt
21 | ```
22 |
23 | ## Usage
24 |
25 | ### Data Preparation
26 |
27 | 1. First, create or import a data directory at the root folder containing documents (.md files).
28 | 2. Copy the path of the directory.
29 | 3. Load and chunk the documents using `load_directory(PATH)`.
30 |
31 | ```python
32 | documents = load_directory("/data")
33 | ```
34 |
35 | ### LLM Setup
36 |
37 | Users need to set up the LLM (llama3) locally to build the Knowledge Graph.
38 |
39 | 1. Initialize the LLM with `initialize_llm()`.
40 | 2. The default parameters are:
41 | - `base_url="http://localhost:11434"` (Ollama server)
42 | - `model="llama3"`
43 | - `chunk_size = 512`
44 | 3. Change the parameters as needed.
45 |
46 | ```python
47 | initialize_llm()
48 | ```
49 |
50 | ### Build Graph Index
51 |
52 | 1. Build the Knowledge Graph using the [documents](#data-preparation).
53 | 2. Call `build_graph(documents)` to create an index.
54 | 3. This will also save `Graph_visualization.html`, which can be opened in a browser to visualize the Knowledge Graph.
55 |
56 | ```python
57 | index = build_graph(documents)
58 | ```
59 |
60 | 4. Save the `index` as a pickle file.
61 |
62 | ```python
63 | save_index(index)
64 | ```
65 | The following example can be referred to for a detailed implementation:
66 | [GraphRag Example Notebook](Example/GraphRagExample.ipynb)
67 |
--------------------------------------------------------------------------------
/graph_rag/evaluation/evaluation_llama_index.py:
--------------------------------------------------------------------------------
1 | """
2 | This script evaluates a RagDataset using a RagEvaluatorPack, which assesses query engines by benchmarking against
3 | labeled data using LLMs and embeddings.
4 |
5 | Functions:
6 | - evaluate: Evaluates the query engine using a labeled RAG dataset and specified models for both the LLM and embeddings.
7 | """
8 |
9 | from llama_index.core.llama_dataset import LabelledRagDataset
10 | from llama_index.core.llama_pack import download_llama_pack
11 | from llama_index.llms.ollama import Ollama
12 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding
13 |
14 |
15 |
16 |
17 |
18 | async def evaluate(
19 | RAG_DATASET: str,
20 | query_engine: object,
21 | ollama_model: str = "llama3",
22 | embedd_model: str = "microsoft/codebert-base",
23 | ):
24 | """
25 | Evaluates a RAG dataset by using a query engine and benchmarks it using LLM and embedding models.
26 |
27 | Args:
28 | RAG_DATASET: Path to the JSON file containing the labeled RAG dataset.
29 | query_engine: The query engine to evaluate.
30 | ollama_model: The LLM model to use for evaluation (default: "llama3").
31 | embedd_model: The Hugging Face embedding model to use for evaluation (default: "microsoft/codebert-base").
32 |
33 | Returns:
34 | A DataFrame containing the benchmarking results, including LLM calls and evaluations.
35 | """
36 |
37 | RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./rag_evaluator_pack")
38 | rag_dataset = LabelledRagDataset.from_json(RAG_DATASET)
39 | rag_evaluator_pack = RagEvaluatorPack(
40 | rag_dataset=rag_dataset,
41 | query_engine=query_engine,
42 | judge_llm=Ollama(base_url="http://localhost:11434", model=ollama_model),
43 | embed_model=HuggingFaceEmbedding(model_name=embedd_model),
44 | )
45 | benchmark_df = await rag_evaluator_pack.arun(
46 | batch_size=5, # batches the number of llm calls to make
47 | sleep_time_in_seconds=1, # seconds to sleep before making an api call
48 | )
49 | return benchmark_df
50 |
--------------------------------------------------------------------------------
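Because `evaluate` awaits the evaluator pack, it has to run inside an event loop. A minimal usage sketch, assuming an index pickled by `graph_builder`, the helpers from `graph_rag/graph_retrieval`, and a labelled dataset JSON (file and import paths are illustrative):

```python
import asyncio

from graph_retrieval.graph_retrieval import get_index_from_pickle, get_query_engine
from evaluation.evaluation_llama_index import evaluate

# Rebuild a query engine from a previously saved graph index.
index = get_index_from_pickle("results/graphIndex.pkl")
query_engine = get_query_engine(index)

# Benchmark the query engine against the labelled RAG dataset.
benchmark_df = asyncio.run(
    evaluate("rag_dataset.json", query_engine, ollama_model="llama3")
)
print(benchmark_df)
```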
/project_processor/examples/examples.py:
--------------------------------------------------------------------------------
1 | from gh_processor import download_github_repo, extract_headings_with_paragraphs_from_markdown, get_files_by_extension
2 |
3 | git_url = "https://github.com/c2siorg/Project-Explainer.git"
4 |
5 | repo_path = download_github_repo(git_url)
6 |
7 | print(repo_path)
8 |
9 | markdown_files = get_files_by_extension(repo_path, [".md"])
10 |
11 | headings_with_content = {}
12 |
13 | print(markdown_files)
14 |
15 | for markdown_file in markdown_files:
16 | print(markdown_file)
17 | headings_with_content[markdown_file] = extract_headings_with_paragraphs_from_markdown(markdown_file)
18 |
19 | print(headings_with_content)
20 |
21 | ## Output
22 |
23 | # {'/Users/sripravallika/Project-Explainer/Project-Explainer/README.md': {'Project-Explainer': 'Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.'}}
24 |
--------------------------------------------------------------------------------
/graph_rag/graph_builder/knowledgeGraph.py:
--------------------------------------------------------------------------------
1 | """
2 | This file contains the functions to build and save the KnowledgeGraph Index and save it as a pickle-file
3 | """
4 |
5 | from llama_index.core import StorageContext
6 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding
7 | from llama_index.core import KnowledgeGraphIndex
8 | from llama_index.core.graph_stores import SimpleGraphStore
9 | from pyvis.network import Network
10 | import os
11 | import pickle
12 |
13 |
14 | def build_graph(
15 | documents: str,
16 | llm: str = None,
17 | max_triplets_per_chunk: int = 10,
18 | embeddings: str = "microsoft/codebert-base",
19 | include_embeddings: bool = False,
20 | ):
21 | """
22 | This function builds KnowledgeGraph Index that can be queried
23 | Args:
24 | documents: llama-index Document type object
25 | llm: Optional LLM used for triplet extraction (defaults to the model configured via initialize_llm)
26 | max_triplets_per_chunk: Max triplets that can be extracted from each document chunk (default: 10)
27 | embeddings: Hugging-Face Embeddings model name default: microsoft/codebert-base
28 |
29 | Returns:
30 | Knowledge Graph-index, also saves html visualization file
31 | """
32 | try:
33 | graph_store = SimpleGraphStore()
34 | storage_context = StorageContext.from_defaults(graph_store=graph_store)
35 | index = KnowledgeGraphIndex.from_documents(
36 | documents,
37 | max_triplets_per_chunk=max_triplets_per_chunk,
38 | llm=llm,
39 | embed_model=HuggingFaceEmbedding(model_name=embeddings),
40 | storage_context=storage_context,
41 | include_embeddings=include_embeddings,
42 | )
43 | print("KG built successfully!")
44 |
45 | os.makedirs("results", exist_ok=True)
46 | g = index.get_networkx_graph()
47 | net = Network(notebook=True, cdn_resources="in_line", directed=True)
48 | net.from_nx(g)
49 | net.show("Graph_visualization.html")
50 | return index
51 | except Exception as e:
52 | print(f"Error building graph: {e}")
53 | return None
54 |
55 |
56 | def save_index(index, output_dir_path: str = "results/"):
57 | """
58 | Serializes the index object, so that it can be loaded and used later
59 | Args:
60 | index: Graph-Index object
61 | output_dir_path: Directory the pickle file is written to (default: "results/")
62 | Returns:
63 | Saves pickle file of the Graph-Index
64 | """
65 | try:
66 | os.makedirs(output_dir_path, exist_ok=True)
67 | with open(os.path.join(output_dir_path, "graphIndex.pkl"), "wb") as f:
68 | pickle.dump(index, f)
69 | print("Index saved successfully!")
70 | except Exception as e:
71 | print(f"Error saving index: {e}")
72 |
--------------------------------------------------------------------------------
/graph_rag/graph_retrieval/graph_retrieval.py:
--------------------------------------------------------------------------------
1 | """
2 | This file contains methods for loading graph_index from pkl file and retrieval of graph_index
3 | """
4 |
5 | from ..graph_builder.tools import initialize_llm
6 | import pickle
7 |
8 |
9 | def get_index_from_pickle(
10 | file_path: str = "results/graphIndex.pkl",
11 | ):
12 | """
13 | Deserializes a .pkl file to get the graph_index.
14 | Args:
15 | file_path (str): The path to the .pkl file.
16 |
17 | Returns:
18 | object: The deserialized llama_index graph_index object.
19 |
20 | """
21 | try:
22 | with open(file_path, "rb") as file:
23 | index = pickle.load(file)
24 | return index
25 | except FileNotFoundError:
26 | print(f"File not found: {file_path}")
27 | raise
28 | except IOError as e:
29 | print(f"Error reading file: {e}")
30 | raise
31 | except pickle.UnpicklingError as e:
32 | print(f"Error deserializing file: {e}")
33 | raise
34 |
35 |
36 | def get_query_engine(index, with_embedding: bool = False, similarity_top_k: int = 5):
37 | """
38 | Creates a query engine with the preferred settings used to query the graph_index.
39 | Args:
40 | index (object): llama_index graph_index object
41 | with_embedding (bool): Set to True to query the graph_index with embeddings (default: False)
42 | similarity_top_k (int): Number of top chunks provided as context to the LLM when responding to a query
43 |
44 | Returns:
45 | object: llama_index query_engine object
46 |
47 | """
48 | if index is None:
49 | raise ValueError("The index must not be None.")
50 | try:
51 | initialize_llm()
52 | if with_embedding:
53 | query_engine = index.as_query_engine(
54 | include_text=True,
55 | response_mode="tree_summarize",
56 | embedding_mode="hybrid",
57 | similarity_top_k=similarity_top_k,
58 | )
59 | else:
60 | query_engine = index.as_query_engine(
61 | include_text=True, response_mode="tree_summarize"
62 | )
63 | return query_engine
64 | except Exception as e:
65 | print(f"An error occurred while creating the query engine: {e}")
66 | raise
67 |
68 |
69 | def graph_query(query: str, query_engine):
70 | """
71 | method to query graph_index
72 | Args:
73 | query (str): query that is to be answered using graph_rag
74 | query_engine (object): llama_index query_engine object
75 |
76 | Returns:
77 | str: response to the query in string
78 |
79 | """
80 | if not query:
81 | raise ValueError("The query must not be empty or None.")
82 |
83 | try:
84 | response = query_engine.query(query)
85 | print(response.response)
86 | return response
87 | except Exception as e:
88 | print(f"An error occurred while querying: {e}")
89 | raise
90 |
--------------------------------------------------------------------------------
/graph_rag/graph_retrieval/README.MD:
--------------------------------------------------------------------------------
1 | # Graph Index Retriever
2 |
3 | This module provides methods for loading a graph index from a pickle file and querying it using a `llama_index` query engine.
4 |
5 | ## Usage
6 |
7 | ### Loading Graph Index from Pickle File
8 |
9 | 1. Load the graph index from a pickle file using `get_index_from_pickle(file_path)`.
10 | 2. By default, the file path is set to `"results/graphIndex.pkl"`.
11 |
12 | ```python
13 | from graph_rag.graph_retrieval.graph_retrieval import get_index_from_pickle
14 |
15 | index = get_index_from_pickle("path/to/your/graphIndex.pkl")
16 | ```
17 |
18 | ### Setting Up the Query Engine
19 |
20 | 1. Initialize the LLM with `initialize_llm()`.
21 | 2. Create a query engine using `get_query_engine(index, with_embedding=False, similarity_top_k=5)`.
22 | - `index`: The loaded `llama_index` graph index object.
23 | - `with_embedding` (bool): Set to `True` to query the graph index with embeddings. Default is `False`.
24 | - `similarity_top_k` (int): Number of top similar chunks to provide as context to LLM for responding to the query. Default is `5`.
25 |
26 | ```python
27 | from graph_rag.graph_retrieval.graph_retrieval import get_query_engine
28 |
29 | query_engine = get_query_engine(index, with_embedding=False, similarity_top_k=5)
30 | ```
31 |
32 | ### Querying the Graph Index
33 |
34 | 1. Query the graph index using `graph_query(query, query_engine)`.
35 | - `query` (str): The query to be answered using `graph_rag`.
36 | - `query_engine`: The `llama_index` query engine object.
37 |
38 | ```python
39 | from graph_rag.graph_retrieval.graph_retrieval import graph_query
40 |
41 | response = graph_query("Your query here", query_engine)
42 | print(response)
43 | ```
44 | ## Advanced Training with QLoRA and P-Tuning
45 |
46 | > Fine-tuning LLMs on your data (masked language modelling or next-token prediction) for a few epochs may result in better retrieval and responses.
47 |
48 | ### 1. Setup
49 |
50 | To use QLoRA and P-Tuning, ensure your environment is set up with the required libraries and that your model and dataset configurations are defined in a `config.yaml` file.
51 |
52 | ### 2. Fine-Tuning with QLoRA
53 |
54 | Use the QLoRA method for efficient fine-tuning by passing the appropriate configurations in your `config.yaml`. This method is ideal when working with large models on limited hardware.
55 |
56 | Execute the training script with the `--config` argument to specify your configuration file:
57 |
58 | ```bash
59 | python qlora_adapter.py --config path/to/config.yaml
60 | ```
61 | ### 3. Fine-Tuning with P-Tuning
62 |
63 | P-Tuning allows for parameter-efficient prompt-based fine-tuning. Adjust the number of virtual tokens and other related parameters in the `config.yaml` to customize the training process.
64 |
65 | Execute the training script with the `--config` argument to specify your configuration file:
66 |
67 | ```bash
68 | python p_tuning.py --config path/to/config.yaml
69 | ```
70 |
71 |
72 |
73 |
74 |
75 | This will start the training process using the specified method (QLoRA or P-Tuning) and configurations.
76 |
77 |
78 |
--------------------------------------------------------------------------------
/graph_rag/experiments/artifacts/data_keras/index1.md:
--------------------------------------------------------------------------------
1 | # KerasTuner
2 |
3 | Star
4 |
5 | KerasTuner is an easy-to-use, scalable hyperparameter optimization framework
6 | that solves the pain points of hyperparameter search. Easily configure your
7 | search space with a define-by-run syntax, then leverage one of the available
8 | search algorithms to find the best hyperparameter values for your models.
9 | KerasTuner comes with Bayesian Optimization, Hyperband, and Random Search algorithms
10 | built-in, and is also designed to be easy for researchers to extend in order to
11 | experiment with new search algorithms.
12 |
13 | ---
14 | ## Quick links
15 |
16 | * [Getting started with KerasTuner](/guides/keras_tuner/getting_started/)
17 | * [KerasTuner developer guides](/guides/keras_tuner/)
18 | * [KerasTuner API reference](/api/keras_tuner/)
19 | * [KerasTuner on GitHub](https://github.com/keras-team/keras-tuner)
20 |
21 |
22 | ---
23 | ## Installation
24 |
25 | Install the latest release:
26 |
27 | ```
28 | pip install keras-tuner --upgrade
29 | ```
30 |
31 | You can also check out other versions in our
32 | [GitHub repository](https://github.com/keras-team/keras-tuner).
33 |
34 |
35 | ---
36 | ## Quick introduction
37 |
38 | Import KerasTuner and TensorFlow:
39 |
40 | ```python
41 | import keras_tuner
42 | import keras
43 | ```
44 |
45 | Write a function that creates and returns a Keras model.
46 | Use the `hp` argument to define the hyperparameters during model creation.
47 |
48 | ```python
49 | def build_model(hp):
50 | model = keras.Sequential()
51 | model.add(keras.layers.Dense(
52 | hp.Choice('units', [8, 16, 32]),
53 | activation='relu'))
54 | model.add(keras.layers.Dense(1, activation='relu'))
55 | model.compile(loss='mse')
56 | return model
57 | ```
58 |
59 | Initialize a tuner (here, `RandomSearch`).
60 | We use `objective` to specify the objective to select the best models,
61 | and we use `max_trials` to specify the number of different models to try.
62 |
63 | ```python
64 | tuner = keras_tuner.RandomSearch(
65 | build_model,
66 | objective='val_loss',
67 | max_trials=5)
68 | ```
69 |
70 | Start the search and get the best model:
71 |
72 | ```python
73 | tuner.search(x_train, y_train, epochs=5, validation_data=(x_val, y_val))
74 | best_model = tuner.get_best_models()[0]
75 | ```
76 |
77 | To learn more about KerasTuner, check out [this starter guide](https://keras.io/guides/keras_tuner/getting_started/).
78 |
79 |
80 | ---
81 | ## Citing KerasTuner
82 |
83 | If KerasTuner helps your research, we appreciate your citations.
84 | Here is the BibTeX entry:
85 |
86 | ```bibtex
87 | @misc{omalley2019kerastuner,
88 | title = {KerasTuner},
89 | author = {O'Malley, Tom and Bursztein, Elie and Long, James and Chollet, Fran\c{c}ois and Jin, Haifeng and Invernizzi, Luca and others},
90 | year = 2019,
91 | howpublished = {\url{https://github.com/keras-team/keras-tuner}}
92 | }
93 | ```
94 |
--------------------------------------------------------------------------------
/graph_rag/evaluation/ragas_evaluation/prompts.py:
--------------------------------------------------------------------------------
1 | """
2 | This file contains PROMPTS that are passed to llms to generate and critique Test-Dataset for Graph_Rag
3 | """
4 |
5 | QA_generation_prompt = """
6 | Your task is to write a factoid question and an answer given a context.
7 | Your factoid question should be answerable with a specific, concise piece of factual information from the context.
8 | Your factoid question should be formulated in the same style as questions users could ask in a search engine.
9 | This means that your factoid question MUST NOT mention something like "according to the passage" or "context".
10 |
11 | Provide your answer as follows:
12 |
13 | Output:::
14 | Factoid question: (your factoid question)
15 | Answer: (your answer to the factoid question)
16 |
17 | Now here is the context.
18 |
19 | Context: {context}\n
20 | Output:::"""
21 |
22 | question_groundedness_critique_prompt = """
23 | You will be given a context and a question.
24 | Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
25 | Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.
26 |
27 | Provide your answer as follows:
28 |
29 | Answer:::
30 | Evaluation: (your rationale for the rating, as a text)
31 | Total rating: (your rating, as a number between 1 and 5)
32 |
33 | You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
34 |
35 | Now here are the question and context.
36 |
37 | Question: {question}\n
38 | Context: {context}\n
39 | Answer::: """
40 |
41 | question_relevance_critique_prompt = """
42 | You will be given a question.
43 | Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
44 | Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.
45 |
46 | Provide your answer as follows:
47 |
48 | Answer:::
49 | Evaluation: (your rationale for the rating, as a text)
50 | Total rating: (your rating, as a number between 1 and 5)
51 |
52 | You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
53 |
54 | Now here is the question.
55 |
56 | Question: {question}\n
57 | Answer::: """
58 |
59 | question_standalone_critique_prompt = """
60 | You will be given a question.
61 | Your task is to provide a 'total rating' representing how context-independent this question is.
62 | Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
63 | For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
64 | The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.
65 |
66 | For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.
67 |
68 | Provide your answer as follows:
69 |
70 | Answer:::
71 | Evaluation: (your rationale for the rating, as a text)
72 | Total rating: (your rating, as a number between 1 and 5)
73 |
74 | You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
75 |
76 | Now here is the question.
77 |
78 | Question: {question}\n
79 | Answer::: """
80 |
--------------------------------------------------------------------------------
/static/logos/logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/graph_rag/graph_builder/tools.py:
--------------------------------------------------------------------------------
1 | """
2 | This file contains functions for initializing llm for
3 | 1. building KnowledgeGraph
4 | 2. loading documents from directory (also function for splitting code files)
5 | 3. converting llama-index Node to llama-index Documents
6 | """
7 |
8 | from llama_index.llms.ollama import Ollama
9 | from llama_index.core import SimpleDirectoryReader
10 | from llama_index.core.node_parser import CodeSplitter
11 | from llama_index.core import Document
12 | from llama_index.core import Settings
13 |
14 |
15 | def initialize_llm(
16 | base_url: str = "http://localhost:11434",
17 | model: str = "llama3",
18 | chunk_size: int = 512,
19 | ):
20 | """
21 | Initializes the llm for building the KnowledgeGraph
22 | Args:
23 | base_url: The ollama server URL where the model is listening
24 | model: The model string that ollama is hosting and will be used to build the KnowledgeGraph
25 | chunk_size: Size of each chunk that the loaded Documents are split into
26 |
27 | Returns:
28 | None
29 | """
30 | try:
31 | llm = Ollama(base_url=base_url, model=model)
32 | Settings.llm = llm
33 | Settings.chunk_size = chunk_size
34 | print(f"{model} initialized successfully!")
35 | except Exception as e:
36 | print(f"Error initializing LLM: {e}")
37 |
38 |
39 | def code_splitting(documents, language: str = "python"):
40 | """
41 | If the KnowledgeGraph is to be built for code-files then files are split using this function
42 | Args:
43 | documents: llama-index Document objects created from code files
44 | language: The language of coding-file
45 |
46 | Returns:
47 | nodes: Split code chunks, llama-index Nodes type object
48 | """
49 | try:
50 | splitter = CodeSplitter(
51 | language=language,
52 | chunk_lines=30, # lines per chunk
53 | chunk_lines_overlap=6, # lines overlap between chunks
54 | max_chars=1500, # max chars per chunk
55 | )
56 | nodes = splitter.get_nodes_from_documents(documents)
57 | print(f"{len(nodes)} nodes created successfully!")
58 | return nodes
59 | except Exception as e:
60 | print(f"Error splitting code: {e}")
61 | return []
62 |
63 |
64 | def convert_nodes_to_docs(nodes):
65 | """
66 | Converts llama-index Nodes Type object to llama-index Document Type objects
67 | Args:
68 | nodes: llama-index Nodes type object
69 | Returns:
70 | llama-index Document Type objects
71 | """
72 | try:
73 | documents_from_nodes = [
74 | Document(text=node.text, metadata=node.metadata) for node in nodes
75 | ]
76 | print(
77 | f"{len(documents_from_nodes)} number of documents converted successfully!"
78 | )
79 | return documents_from_nodes
80 | except Exception as e:
81 | print(f"Error converting nodes to documents: {e}")
82 | return []
83 |
84 |
85 | def load_directory(
86 | directory_path: str, code_file: bool = False, language: str = "python"
87 | ):
88 | """
89 | Loads the documentation-directory, does preprocessing and chunking depending on code_file parameter
90 | Args:
91 | directory_path: Path to the Files Directory from which Knowledge graph is to be made
92 | code_file: Bool specifying whether the given directory contains code files
93 | language: language of the code-files
94 | Returns:
95 | llama-index Document Type objects
96 | """
97 | try:
98 | documents = SimpleDirectoryReader(directory_path).load_data()
99 | except Exception as e:
100 | print(f"Error loading directory: {e}")
101 | return []
102 |
103 | try:
104 | if code_file:
105 | nodes = code_splitting(documents, language)
106 | docs = convert_nodes_to_docs(nodes)
107 | print(f"{len(docs)} documents loaded successfully!")
108 | return docs
109 |
110 | print(f"{len(documents)} documents loaded successfully!")
111 | return documents
112 | except Exception as e:
113 | print(f"Error processing documents: {e}")
114 | return []
115 |
--------------------------------------------------------------------------------
/graph_rag/evaluation/ragas_evaluation/evaluation_ragas.py:
--------------------------------------------------------------------------------
1 | """
2 | This script loads a pre-processed dataset, slices it for batch evaluation, and runs a series of metrics to evaluate the
3 | performance of a query engine using a language model and embeddings.
4 |
5 | Functions:
6 | - load_test_dataset: Loads a test dataset from a pickle file.
7 | - slice_data: Slices the dataset into batches for evaluation.
8 | - evaluate: Runs evaluation on the sliced dataset using specified metrics, LLMs, and embeddings.
9 |
10 | """
11 |
12 | import pickle
13 | import pandas as pd
14 | from datasets import Dataset
15 | from ragas.integrations.llama_index import evaluate as ragas_evaluate
16 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding
17 | from ragas.metrics.critique import harmfulness
18 | from llama_index.llms.ollama import Ollama
19 | from ragas.metrics import (
20 | faithfulness,
21 | answer_relevancy,
22 | context_precision,
23 | context_recall,
24 | )
25 |
26 |
27 | def load_test_dataset(
28 | data: str,
29 | ):
30 | """
31 | Loads a test dataset from a pickle file.
32 |
33 | Args:
34 | data: The path to the dataset file in pickle format.
35 |
36 | Returns:
37 | A list representing the loaded dataset, or an empty list if loading fails due to EOFError.
38 | """
39 | try:
40 | with open(data, "rb") as f:
41 | dataset = pickle.load(f)
42 | except EOFError:
43 | print("EOFError: The file may be corrupted or incomplete loading empty dictionary.")
44 | dataset = []
45 | return dataset
46 |
47 |
48 | def slice_data(i: int, k: int, dataset: list):
49 | """
50 | Slices the dataset into smaller chunks for batch processing.
51 |
52 | Args:
53 | i: The starting index for the slice.
54 | k: The size of the slice (number of records to include in each batch).
55 | dataset: The list representing the dataset to be sliced.
56 |
57 | Returns:
58 | A dictionary containing the sliced dataset with renamed columns for consistency with the evaluation process.
59 | """
60 |
61 | hf_dataset = Dataset.from_list(dataset[i : i + k])
62 | hf_dataset = hf_dataset.rename_column("context", "contexts")
63 | hf_dataset = hf_dataset.rename_column("answer", "ground_truth")
64 | ds_dict = hf_dataset.to_dict()
65 | return ds_dict
66 |
67 |
68 | def evaluate(
69 | query_engine: object,
70 | dataset: object,
71 | batch: int = 4,
72 | metrics: list = [
73 | faithfulness,
74 | answer_relevancy,
75 | context_precision,
76 | context_recall,
77 | ],
78 | llm: object = Ollama(base_url="http://localhost:11434", model="codellama"),
79 | embeddings=HuggingFaceEmbedding(model_name="microsoft/codebert-base"),
80 | ):
81 | """
82 | Evaluates the performance of a query engine on a dataset using various metrics and a language model.
83 |
84 | Args:
85 | query_engine: The query engine to be evaluated.
86 | dataset: The dataset to be evaluated against.
87 | batch: The number of records to process in each batch (default: 4).
88 | metrics: A list of metrics to be used for evaluation (default: faithfulness, answer relevancy, context precision, and context recall).
89 | llm: The language model to be used for evaluation (default: Ollama with model 'codellama').
90 | embeddings: The embedding model to be used (default: HuggingFaceEmbedding with 'microsoft/codebert-base').
91 |
92 | Returns:
93 | A pandas DataFrame containing the evaluation results for each batch.
94 | """
95 |
96 | rows_count = len(dataset)
97 |
98 | results_df = pd.DataFrame()
99 |
100 | for i in range(0, rows_count, batch):
101 |
102 | batch_data = slice_data(i, batch, dataset=dataset)
103 |
104 | result = ragas_evaluate(
105 | query_engine=query_engine,
106 | metrics=metrics,
107 | dataset=batch_data,
108 | llm=llm,
109 | embeddings=embeddings,
110 | )
111 |
112 | rdf = result.to_pandas()
113 | results_df = pd.concat([results_df, rdf], ignore_index=True)
114 | print(f"Processed batch {i // batch + 1}:")
115 | print(rdf)
116 | print(results_df)
117 | results_df.to_csv("results.csv", index=False)
118 | return results_df
119 |
--------------------------------------------------------------------------------
/graph_rag/evaluation/ragas_evaluation/QA_graphrag_testdataset.py:
--------------------------------------------------------------------------------
1 | """
2 | This script contains functions to generate question-answer pairs from input documents using a language model,
3 | and critique them based on various criteria like groundedness, relevance, and standalone quality.
4 |
5 | Functions:
6 | - get_response: Sends a request to a language model API to generate responses based on a provided prompt.
7 | - qa_generator: Generates a specified number of question-answer pairs from input documents.
8 | - critique_qa: Critiques the generated QA pairs based on groundedness, relevance, and standalone quality.
9 | """
10 |
11 | from prompts import *
12 | import pandas as pd
13 | import random
14 | from tqdm.auto import tqdm
15 | import requests
16 |
17 |
18 | def get_response(
19 | prompt: str, url: str = "http://localhost:11434/api/generate", model: str = "llama3"
20 | ):
21 | """
22 | Sends a prompt to the Ollama API and retrieves the generated response.
23 |
24 | Args:
25 | prompt: The text input that the model will use to generate a response.
26 | url: The API endpoint for the model (default: "http://localhost:11434/api/generate").
27 | model: The model to be used for generation (default: "llama3").
28 |
29 | Returns:
30 | The generated response from the language model as a string.
31 | """
32 |
33 | payload = {"model": model, "prompt": prompt, "stream": False}
34 | response = requests.post(url, json=payload)
35 | resp = response.json()
36 | return resp["response"]
37 |
38 |
39 | def qa_generator(
40 | documents: object,
41 | N_GENERATIONS: int = 20,
42 | ):
43 | """
44 | Generates a specified number of question-answer pairs from the provided documents.
45 |
46 | Args:
47 | documents: A collection of document objects to generate QA pairs from.
48 | N_GENERATIONS: The number of question-answer pairs to generate (default: 20).
49 |
50 | Returns:
51 | A list of dictionaries, each containing the generated context, question, answer, and source document metadata.
52 | """
53 | print(f"Generating {N_GENERATIONS} QA couples...")
54 |
55 | outputs = []
56 | for sampled_context in tqdm(random.sample(documents, N_GENERATIONS)):
57 | # Generate QA couple
58 | output_QA_couple = get_response(
59 | QA_generation_prompt.format(context=sampled_context.text)
60 | )
61 | try:
62 | question = output_QA_couple.split("Factoid question: ")[-1].split(
63 | "Answer: "
64 | )[0]
65 | answer = output_QA_couple.split("Answer: ")[-1]
66 | assert len(answer) < 300, "Answer is too long"
67 | outputs.append(
68 | {
69 | "context": sampled_context.text,
70 | "question": question,
71 | "answer": answer,
72 | "source_doc": sampled_context.metadata,
73 | }
74 | )
75 | except Exception:
76 | continue
77 | df = pd.DataFrame(outputs)
78 | df.to_csv("QA.csv")
79 | return outputs
80 |
81 |
82 | def critique_qa(
83 | outputs: list,
84 | ):
85 | """
86 | Critiques the generated question-answer pairs based on groundedness, relevance, and standalone quality.
87 |
88 | Args:
89 | outputs: A list of dictionaries containing generated QA pairs to be critiqued.
90 |
91 | Returns:
92 | The critiqued QA pairs with additional fields for groundedness, relevance, and standalone quality scores and evaluations.
93 | """
94 | print("Generating critique for each QA couple...")
95 | for output in tqdm(outputs):
96 | evaluations = {
97 | "groundedness": get_response(
98 | question_groundedness_critique_prompt.format(
99 | context=output["context"], question=output["question"]
100 | ),
101 | ),
102 | "relevance": get_response(
103 | question_relevance_critique_prompt.format(question=output["question"]),
104 | ),
105 | "standalone": get_response(
106 | question_standalone_critique_prompt.format(question=output["question"]),
107 | ),
108 | }
109 | try:
110 | for criterion, evaluation in evaluations.items():
111 | score, eval = (
112 | int(evaluation.split("Total rating: ")[-1].strip()),
113 | evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
114 | )
115 | output.update(
116 | {
117 | f"{criterion}_score": score,
118 | f"{criterion}_eval": eval,
119 | }
120 | )
121 | except Exception as e:
122 | continue
123 | generated_questions = pd.DataFrame.from_dict(outputs)
124 | generated_questions = generated_questions.loc[
125 | (generated_questions["groundedness_score"] >= 4)
126 | & (generated_questions["relevance_score"] >= 4)
127 | & (generated_questions["standalone_score"] >= 4)
128 | ]
129 | generated_questions.to_csv("generated_questions.csv")
130 | return outputs
131 |
--------------------------------------------------------------------------------
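A short sketch of how these functions chain together, assuming documents loaded with `load_directory` from `graph_builder/tools.py` and a local Ollama server (import paths are illustrative):

```python
from graph_builder.tools import load_directory
from QA_graphrag_testdataset import qa_generator, critique_qa

# Load source documents, generate QA pairs, then critique and filter them.
documents = load_directory("/data")
qa_pairs = qa_generator(documents, N_GENERATIONS=20)  # also writes QA.csv
critique_qa(qa_pairs)  # also writes generated_questions.csv with the filtered set
```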
/project_explainer/examples/examples.py:
--------------------------------------------------------------------------------
1 | from gh_explainer import Explainer
2 |
3 | gptExplainer = Explainer("gpt2")
4 |
5 | print(gptExplainer.brief("https://github.com/c2siorg/Project-Explainer.git"))
6 |
7 | ## output
8 |
9 | # {'prompt': {'prompt': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.'}, 'prepared_prompt': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.\nExplain the above : ', 'summary': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. 
The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.\nExplain the above : \xa0The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model.'}
--------------------------------------------------------------------------------
/graph_rag/graph_builder/Example/build_with_relic.MD:
--------------------------------------------------------------------------------
1 | # Knowledge Graph with Relik and Llama-Index
2 |
3 | This markdown file demonstrates an experiment in building a knowledge graph using `Relik` and `Llama-Index` Property Graphs. The steps include coreference resolution with `Spacy`, relation extraction with `Relik`, and knowledge graph construction with `llama-index` PropertyGraphs, stored in `neo4j`.
4 |
5 | ## Import Necessary Libraries
6 |
7 | Import the essential libraries required for the experiment. These include NLP tools (`Spacy`, `coreferee`), document readers, large language models (LLMs), embeddings, and Neo4j for graph storage.
8 |
9 | ```python
10 | import spacy, coreferee
11 | from llama_index.core import SimpleDirectoryReader
12 | import nest_asyncio
13 | from llama_index.llms.ollama import Ollama
14 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding
15 | from llama_index.core import PropertyGraphIndex
16 | from llama_index.core import Settings
17 | from llama_index.extractors.relik.base import RelikPathExtractor
18 | from llama_index.graph_stores.neo4j import Neo4jPGStore
19 | ```
20 |
21 | ## Coreference Resolution Function
22 |
23 | Sets up a function to resolve coreferences in a text. This is crucial for ensuring that references to entities like "she" or "it" are correctly linked back to their antecedents, which reduces duplicate nodes in the knowledge graph.
24 |
25 | ```python
26 | coref_nlp = spacy.load('en_core_web_lg')
27 | coref_nlp.add_pipe('coreferee')
28 |
29 | def coref_text(text):
30 | coref_doc = coref_nlp(text)
31 | resolved_text = ""
32 |
33 | for token in coref_doc:
34 | repres = coref_doc._.coref_chains.resolve(token)
35 | if repres:
36 | resolved_text += " " + " and ".join(
37 | [
38 | t.text
39 | if t.ent_type_ == ""
40 | else [e.text for e in coref_doc.ents if t in e][0]
41 | for t in repres
42 | ]
43 | )
44 | else:
45 | resolved_text += " " + token.text
46 |
47 | return resolved_text
48 | ```
49 |
50 | ### Example Usage of Coreference Resolution
51 |
52 | An example is provided to demonstrate how the `coref_text` function resolves references in the text.
53 |
54 | ```python
55 | coref_text("alice is great. she can study for long hours and remember")
56 | # Output: alice is great. alice can study for long hours and remember
57 | ```
58 |
59 | ## Load and Process Documents
60 |
61 | The documents are loaded from a specified directory and processed with the coreference resolution function to prepare them for knowledge graph construction.
62 |
63 | ```python
64 | documents = SimpleDirectoryReader(input_dir='/content/data').load_data()
65 | len(documents)
66 |
67 | for doc in documents:
68 | doc.text = coref_text(doc.text)
69 | ```
70 |
71 | ## Initialize Relik Path Extractor
72 |
73 | Here, the `RelikPathExtractor` is initialized, which will be used to extract relationships between entities from the processed documents.
74 |
75 | ```python
76 | relik = RelikPathExtractor(
77 | model="relik-ie/relik-relation-extraction-small", model_config={"skip_metadata": True}
78 | )
79 | ```
80 |
81 | ## Set Up Language Model and Embeddings
82 |
83 | This section configures the LLM (`Ollama`) and the embedding model (`HuggingFaceEmbedding`) to be used for generating embeddings for the knowledge graph.
84 |
85 | ```python
86 | llm = Ollama(base_url="http://localhost:11434", model="llama3.1")
87 | embed_model = HuggingFaceEmbedding(model_name="microsoft/codebert-base")
88 | Settings.llm = llm
89 | ```
90 |
91 | ## Configure Neo4j Graph Store
92 |
93 | Sets up the connection to a Neo4j database, where the knowledge graph will be stored. Be sure to replace the password placeholder with your actual Neo4j password.
94 |
95 | ```python
96 | username = "neo4j"
97 | password = "*****************************"
98 | url = "neo4j+s://45256b03.databases.neo4j.io"
99 |
100 | graph_store = Neo4jPGStore(
101 | username=username,
102 | password=password,
103 | url=url,
104 | refresh_schema=False
105 | )
106 | ```
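To avoid committing credentials to the repository, a common alternative is to read them from environment variables instead of hardcoding them. A minimal sketch, assuming you export `NEO4J_USERNAME`, `NEO4J_PASSWORD`, and `NEO4J_URL` beforehand (the variable names are illustrative):

```python
import os

# Read connection details from the environment (variable names are illustrative).
username = os.environ["NEO4J_USERNAME"]
password = os.environ["NEO4J_PASSWORD"]
url = os.environ["NEO4J_URL"]

graph_store = Neo4jPGStore(
    username=username,
    password=password,
    url=url,
    refresh_schema=False
)
```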
107 |
108 | ## Build the Knowledge Graph
109 |
110 | Here, the knowledge graph is constructed from the processed documents using the configured tools: `Relik`, `Ollama`, `HuggingFaceEmbedding`, and `Neo4j`.
111 |
112 | ```python
113 | index = PropertyGraphIndex.from_documents(
114 | documents,
115 | kg_extractors=[relik],
116 | llm=llm,
117 | embed_model=embed_model,
118 | property_graph_store=graph_store,
119 | show_progress=True,
120 | )
121 | ```
122 | 
123 |
124 |
125 | ## Query the Knowledge Graph
126 |
127 | Finally, a query engine is created, allowing you to query the knowledge graph. Example queries and their expected outputs are provided.
128 |
129 | ```python
130 | query_engine = index.as_query_engine(include_text=True)
131 |
132 | response = query_engine.query("what is keras nlp?")
133 | print(str(response))
134 |
135 | # Output: Keras NLP provides a simple way to fine-tune pre-trained language models for various natural language processing tasks...
136 | ```
137 |
138 | ```python
139 | response = query_engine.query("format for citing keras nlp")
140 | print(str(response))
141 |
142 | # Output: To cite Keras NLP, you can refer to the following format: KerasNLP. (n.d.). Retrieved from ...
143 | ```
144 |
--------------------------------------------------------------------------------
/graph_rag/experiments/artifacts/data_keras/index2.md:
--------------------------------------------------------------------------------
1 | # KerasNLP
2 |
5 | KerasNLP is a natural language processing library that works natively
6 | with TensorFlow, JAX, or PyTorch. Built on Keras 3, these models, layers,
7 | metrics, and tokenizers can be trained and serialized in any framework and
8 | re-used in another without costly migrations.
9 |
10 | KerasNLP supports users through their entire development cycle. Our workflows
11 | are built from modular components that have state-of-the-art preset weights when
12 | used out-of-the-box and are easily customizable when more control is needed.
13 |
14 | This library is an extension of the core Keras API; all high-level modules are
15 | [`Layers`](/api/layers/) or
16 | [`Models`](/api/models/) that receive that same level of polish
17 | as core Keras. If you are familiar with Keras, congratulations! You already
18 | understand most of KerasNLP.
19 |
20 | See our [Getting Started guide](/guides/keras_nlp/getting_started)
21 | to start learning our API. We welcome
22 | [contributions](https://github.com/keras-team/keras-nlp/blob/master/CONTRIBUTING.md).
23 |
24 | ---
25 | ## Quick links
26 |
27 | * [KerasNLP API reference](/api/keras_nlp/)
28 | * [KerasNLP on GitHub](https://github.com/keras-team/keras-nlp)
29 | * [List of available pre-trained models](/api/keras_nlp/models/)
30 |
31 | ## Guides
32 |
33 | * [Getting Started with KerasNLP](/guides/keras_nlp/getting_started/)
34 | * [Uploading Models with KerasNLP](/guides/keras_nlp/upload/)
35 | * [Pretraining a Transformer from scratch](/guides/keras_nlp/transformer_pretraining/)
36 |
37 | ## Examples
38 |
39 | * [GPT-2 text generation](/examples/generative/gpt2_text_generation_with_kerasnlp/)
40 | * [Parameter-efficient fine-tuning of GPT-2 with LoRA](/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora/)
41 | * [Semantic Similarity](/examples/nlp/semantic_similarity_with_keras_nlp/)
42 | * [Sentence embeddings using Siamese RoBERTa-networks](/examples/nlp/sentence_embeddings_with_sbert/)
43 | * [Data Parallel Training with tf.distribute](/examples/nlp/data_parallel_training_with_keras_nlp/)
44 | * [English-to-Spanish translation](/examples/nlp/neural_machine_translation_with_keras_nlp/)
45 | * [GPT text generation from scratch](/examples/generative/text_generation_gpt/)
46 | * [Text Classification using FNet](/examples/nlp/fnet_classification_with_keras_nlp/)
47 |
48 | ---
49 | ## Installation
50 |
51 | KerasNLP supports both Keras 2 and Keras 3. We recommend Keras 3 for all new
52 | users, as it enables using KerasNLP models and layers with JAX, TensorFlow and
53 | PyTorch.
54 |
55 | ### Keras 2 Installation
56 |
57 | To install the latest KerasNLP release with Keras 2, simply run:
58 |
59 | ```
60 | pip install --upgrade keras-nlp
61 | ```
62 |
63 | ### Keras 3 Installation
64 |
65 | There are currently two ways to install Keras 3 with KerasNLP. To install the
66 | stable versions of KerasNLP and Keras 3, you should install Keras 3 **after**
67 | installing KerasNLP. This is a temporary step while TensorFlow is pinned to
68 | Keras 2, and will no longer be necessary after TensorFlow 2.16.
69 |
70 | ```
71 | pip install --upgrade keras-nlp
72 | pip install --upgrade keras
73 | ```
74 |
75 | To install the latest nightly changes for both KerasNLP and Keras, you can use
76 | our nightly package.
77 |
78 | ```
79 | pip install --upgrade keras-nlp-nightly
80 | ```
81 |
82 | **Note:** Keras 3 will not function with TensorFlow 2.14 or earlier.
83 |
84 | See [Getting started with Keras](/getting_started/) for more information on
85 | installing Keras generally and compatibility with different frameworks.
86 |
87 | ---
88 | ## Quickstart
89 |
90 | Fine-tune BERT on a small sentiment analysis task using the
91 | [`keras_nlp.models`](/api/keras_nlp/models/) API:
92 |
93 | ```python
94 | import os
95 | os.environ["KERAS_BACKEND"] = "tensorflow" # Or "jax" or "torch"!
96 |
97 | import keras_nlp
98 | import tensorflow_datasets as tfds
99 |
100 | imdb_train, imdb_test = tfds.load(
101 | "imdb_reviews",
102 | split=["train", "test"],
103 | as_supervised=True,
104 | batch_size=16,
105 | )
106 | # Load a BERT model.
107 | classifier = keras_nlp.models.BertClassifier.from_preset(
108 | "bert_base_en_uncased",
109 | num_classes=2,
110 | )
111 | # Fine-tune on IMDb movie reviews.
112 | classifier.fit(imdb_train, validation_data=imdb_test)
113 | # Predict two new examples.
114 | classifier.predict(["What an amazing movie!", "A total waste of my time."])
115 | ```
116 |
117 | ---
118 | ## Compatibility
119 |
120 | We follow [Semantic Versioning](https://semver.org/), and plan to
121 | provide backwards compatibility guarantees both for code and saved models built
122 | with our components. While we continue with pre-release `0.y.z` development, we
123 | may break compatibility at any time and APIs should not be considered stable.
124 |
125 | ## Disclaimer
126 |
127 | KerasNLP provides access to pre-trained models via the `keras_nlp.models` API.
128 | These pre-trained models are provided on an "as is" basis, without warranties
129 | or conditions of any kind. The following underlying models are provided by third
130 | parties, and subject to separate licenses:
131 | BART, DeBERTa, DistilBERT, GPT-2, OPT, RoBERTa, Whisper, and XLM-RoBERTa.
132 |
133 | ## Citing KerasNLP
134 |
135 | If KerasNLP helps your research, we appreciate your citations.
136 | Here is the BibTeX entry:
137 |
138 | ```bibtex
139 | @misc{kerasnlp2022,
140 | title={KerasNLP},
141 | author={Watson, Matthew, and Qian, Chen, and Bischof, Jonathan and Chollet,
142 | Fran\c{c}ois and others},
143 | year={2022},
144 | howpublished={\url{https://github.com/keras-team/keras-nlp}},
145 | }
146 | ```
147 |
--------------------------------------------------------------------------------
/project_explainer/gh_explainer/summarize.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
3 | from gh_processor import (download_github_repo,
4 | extract_project_description_from_readme,
5 | extract_headings_with_paragraphs_from_markdown,
6 | remove_tables_from_markdown,
7 | remove_code_blocks_from_markdown,
8 | remove_images_from_markdown,
9 | remove_links_from_markdown)
10 | import os
11 | from jinja2 import Template
12 |
13 |
14 | class Explainer():
15 | def __init__(self, base_model_id: str, device: str = "cpu") -> None:
16 | """
17 | Initializes the Explainer object.
18 |
19 | Args:
20 | base_model_id: The ID or path to the base model.
21 | device: The device to use for model inference (default is "cpu").
22 |
23 | Raises:
24 | ValueError: If the provided base model ID or path is invalid.
25 | """
26 | self.base_model_id = base_model_id
27 | self.device = device
28 |         self.tokenizer = AutoTokenizer.from_pretrained(base_model_id)
29 | try:
30 | # support decoder only models
31 | if self.device == "cuda":
32 |                 self.model = AutoModelForCausalLM.from_pretrained(base_model_id, return_dict=True).to("cuda")
33 |             else:
34 |                 self.model = AutoModelForCausalLM.from_pretrained(base_model_id, return_dict=True)
35 | self.brief_prompt_template = "{{ prompt }}\nExplain the above : "
36 | except Exception as e:
37 | # support encoder decoder models
38 | try:
39 | if self.device == "cuda":
40 |                     self.model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id, return_dict=True).to("cuda")
41 |                 else:
42 |                     self.model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id, return_dict=True)
43 | self.brief_prompt_template = "summarize: {{ prompt }}"
44 | except Exception as e2:
45 | raise ValueError(str(e), str(e2))
46 |
47 | def _fill_template(self, template_string: str, variables: dict) -> str:
48 | """
49 | Fills in variables in a template string using the provided dictionary and returns the filled template.
50 |
51 | Args:
52 | template_string: The template string with variables to be filled.
53 | variables: A dictionary containing the variable names and their corresponding values.
54 |
55 | Returns:
56 | The filled template string.
57 |
58 | Raises:
59 | TypeError: If the template_string is not a string or variables is not a dictionary.
60 | """
61 | template = Template(template_string)
62 | filled_template = template.render(variables)
63 | return filled_template
64 |
65 | def _model_gen(self, prompt: str) -> str:
66 | """
67 | Generates a response using a hugging face transformer model based on the provided prompt.
68 |
69 | Args:
70 | prompt: The input prompt for generating the response.
71 |
72 | Returns:
73 | The generated response as a string.
74 |
75 | Raises:
76 | TypeError: If the prompt is not a string.
77 | """
78 |         inputs = self.tokenizer.encode(prompt, return_tensors='pt', max_length=1024, truncation=True)
79 | output = self.model.generate(inputs, min_length=256, max_length=512)
80 | return self.tokenizer.decode(output[0], skip_special_tokens=True)
81 |
82 | def brief(self, github_url: str, branch: str = "main") -> dict:
83 | """
84 | Generates a brief summary of a project based on its README file.
85 |
86 | Args:
87 | github_url: The URL of the GitHub repository.
88 | branch: The branch name to download (default is "main").
89 |
90 | Returns:
91 | A dictionary containing the original prompt, prepared prompt, and the generated summary.
92 |
93 | Raises:
94 | ValueError: If the README.md file is not found.
95 | """
96 | repo_path = download_github_repo(github_url, branch)
97 | readme_path = os.path.join(repo_path, "README.md")
98 | if not os.path.exists(readme_path):
99 | raise ValueError("README.md not found")
100 | project_description = extract_project_description_from_readme(readme_path)
101 | prompt = {"prompt": project_description}
102 | prepared_prompt = self._fill_template(self.brief_prompt_template, prompt)
103 |         summary = self._model_gen(prepared_prompt)
104 | return {"prompt": prompt, "prepared_prompt": prepared_prompt, "summary": str(summary)}
105 |
106 | def outline(self, github_url: str, branch: str = "main") -> dict:
107 | """
108 | Generates an outline of a project based on its README file.
109 |
110 | Args:
111 | github_url: The URL of the GitHub repository.
112 | branch: The branch name to download (default is "main").
113 |
114 | Returns:
115 | A dictionary containing the outline with headings as keys and generated summaries as values.
116 |
117 | Raises:
118 | ValueError: If the README.md file is not found.
119 | """
120 | repo_path = download_github_repo(github_url, branch)
121 | readme_path = os.path.join(repo_path, "README.md")
122 | if not os.path.exists(readme_path):
123 | raise ValueError("README.md not found")
124 | headings_and_paras = extract_headings_with_paragraphs_from_markdown(readme_path)
125 | outline_dict = {}
126 | for key, value in headings_and_paras.items():
127 | content = remove_code_blocks_from_markdown(remove_images_from_markdown(remove_links_from_markdown(remove_tables_from_markdown(value))))
128 | prompt = {"prompt": content}
129 | prepared_prompt = self._fill_template(self.brief_prompt_template, prompt)
130 | outline_dict[key] = self._model_gen(prepared_prompt)
131 | return outline_dict
132 |
--------------------------------------------------------------------------------
/graph_rag/evaluation/random/results_5.csv:
--------------------------------------------------------------------------------
1 | question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_precision,context_recall
2 | "What is mixed precision in computing?
3 | ","['Examples\n\n* GPT-2 text generation\n* Parameter-efficient fine-tuning of GPT-2 with LoRA\n* Semantic Similarity\n* Sentence embeddings using Siamese RoBERTa-networks\n* Data Parallel Training with tf.distribute\n* English-to-Spanish translation\n* GPT text generation from scratch\n* Text Classification using FNet\n\n---'
4 | ""Supported hardware\n\nWhile mixed precision will run on most hardware, it will only speed up models on recent NVIDIA GPUs and Google TPUs.\nNVIDIA GPUs support using a mix of float16 and float32, while TPUs support a mix of bfloat16 and float32.\n\nAmong NVIDIA GPUs, those with compute capability 7.0 or higher will see the greatest performance benefit\nfrom mixed precision because they have special hardware units, called Tensor Cores,\nto accelerate float16 matrix multiplications and convolutions. Older GPUs offer no math\nperformance benefit for using mixed precision, however memory and bandwidth savings can enable some speedups.\nYou can look up the compute capability for your GPU at NVIDIA's CUDA GPU web page.\nExamples of GPUs that will benefit most from mixed precision include RTX GPUs, the V100, and the A100.\n\nEven on CPUs and older GPUs, where no speedup is expected, mixed precision APIs can still be used for unit testing,\ndebugging, or just to try out the API. On CPUs, mixed precision will run significantly slower, however.\n\n\nYou can check your GPU type with the following command:\n\n```\nnvidia-smi -L\n```""
5 | ""The following are knowledge sequence in max depth 2 in the form of directed graph like:\n`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`\n['Mixed precision', 'Will run on most hardware', 'But will only speed up models on recent nvidia gpus and google tpus']""]","Mixed precision is a technique used to improve the efficiency of computer computations by using lower-precision data types (e.g., float16 or bfloat16) for some parts of the computation while still maintaining accuracy. It is commonly used in deep learning and other numerical compute-intensive applications, where memory bandwidth and computational resources are limited.
6 |
7 | While mixed precision will run on most hardware, it will only speed up models on recent NVIDIA GPUs and Google TPUs. Older GPUs offer no math performance benefit for using mixed precision, however memory and bandwidth savings can enable some speedups. You can look up the compute capability for your GPU at NVIDIA's CUDA GPU web page to determine whether it will benefit from mixed precision. Examples of GPUs that will benefit most from mixed precision include RTX GPUs, the V100, and the A100.",A combination of different numerical precisions used to optimize memory usage and computation speed.,0.16666666666666666,0.9818589787934606,0.0,0.6666666666666666
8 | "What is the title of the guide discussed in the given context?
9 | ","['Available guides\n\n{{toc}}'
10 | 'Hyperparameter Tuning\n\nThese guides cover KerasTuner best practices.']","The title of the guide discussed in the given context is ""Hyperparameter Tuning""",How to distribute training,0.0,0.9999999999999996,0.0,1.0
11 | "What is Keras 3?
12 | ",['No relationships found.'],"Keras 3 is a new version of the popular deep learning framework Keras that was released in October 2020. It introduces several new features and improvements over the previous versions, including support for more complex architectures, improved performance on certain tasks, and easier integration with other libraries and frameworks.","A deep learning framework that works with TensorFlow, JAX, and PyTorch interchangeably.",0.0,0.9747114512069875,0.0,0.5
13 | "What was the percentage boost in StableDiffusion training throughput with Keras 3?
14 | ","['A first example: A MNIST convnet\n\nLet\'s start with the Hello World of ML: training a convnet\nto classify MNIST digits.\n\nHere\'s the data:\n\n\n```python\n# Load the data and split it between train and test sets\n(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n\n# Scale images to the [0, 1] range\nx_train = x_train.astype(""float32"") / 255\nx_test = x_test.astype(""float32"") / 255\n# Make sure images have shape (28, 28, 1)\nx_train = np.expand_dims(x_train, -1)\nx_test = np.expand_dims(x_test, -1)\nprint(""x_train shape:"", x_train.shape)\nprint(""y_train shape:"", y_train.shape)\nprint(x_train.shape[0], ""train samples"")\nprint(x_test.shape[0], ""test samples"")\n```\n\n\n```\nx_train shape: (60000, 28, 28, 1)\ny_train shape: (60000,)\n60000 train samples\n10000 test samples\n\n```\n\nHere\'s our model.\n\nDifferent model-building options that Keras offers include:\n\n- The Sequential API (what we use below)\n- The Functional API (most typical)\n- Writing your own models yourself via subclassing (for advanced use cases)\n\n\n```python\n# Model parameters\nnum_classes = 10\ninput_shape = (28, 28, 1)\n\nmodel = keras.Sequential(\n [\n keras.layers.Input(shape=input_shape),\n keras.layers.Conv2D(64, kernel_size=(3, 3), activation=""relu""),\n keras.layers.Conv2D(64, kernel_size=(3, 3), activation=""relu""),\n keras.layers.MaxPooling2D(pool_size=(2, 2)),\n keras.layers.Conv2D(128, kernel_size=(3, 3), activation=""relu""),\n keras.layers.Conv2D(128, kernel_size=(3, 3), activation=""relu""),\n keras.layers.GlobalAveragePooling2D(),\n keras.layers.Dropout(0.5),\n keras.layers.Dense(num_classes, activation=""softmax""),\n ]\n)\n```\n\nHere\'s our model summary:\n\n\n```python\nmodel.summary()\n```\n\n\nModel: ""sequential""']","The percentage boost in StableDiffusion training throughput with Keras 3 is not mentioned in the provided context information. Therefore, I cannot answer this question.",Over 150%,0.0,0.9705647972395562,0.9999999999,1.0
15 | "What are some examples of pretrained models available in KerasCV and KerasNLP?
16 | ",['No relationships found.'],"Some examples of pre-trained models available in KerasCV include the InceptionV3 model, the ResNet50 model, and the VGG16 model. Some examples of pre-trained models available in KerasNLP include the BERT model, the RoBERTa model, and the DistilBERT model. These models can be used for a variety of natural language processing tasks such as text classification, sentiment analysis, named entity recognition, and question answering.","BERT, OPT, Whisper, T5, StableDiffusion, YOLOv8, SegmentAnything, etc.",0.0,0.9897694771234743,0.0,0.8571428571428571
17 |
--------------------------------------------------------------------------------
/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py:
--------------------------------------------------------------------------------
1 | """
2 | This script prepares data from a repository for training a P-tuning model using the PEFT library.
3 | It reads source files, processes them into tokenized chunks, and trains a language model using the specified configuration.
4 |
5 | Functions:
6 | - prepare_data: Collects files from a repository, concatenates their content, and saves it to an output file.
7 | - data_for_training: Tokenizes the concatenated content and prepares it for language model training.
8 | - get_peft_model: Initializes and configures a P-tuning model using the specified configuration.
9 | - create_training_arguments: Generates training arguments for the Trainer using the configuration settings.
10 | - create_trainer: Creates a Trainer object with the model, data, and training arguments.
11 | - main: Parses the YAML configuration file and runs the training process.
12 |
13 | Requirements:
14 | - A YAML configuration file that specifies model, training, and data parameters.
15 | """
16 |
17 | import argparse
18 | import yaml
19 | import os
20 | import glob
21 | from datasets import Dataset
22 | from transformers import Trainer, DataCollatorForLanguageModeling
23 | from transformers import AutoModelForCausalLM, AutoTokenizer
24 | from peft import get_peft_model as peft_get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit
25 | from transformers import TrainingArguments
26 |
27 |
28 | def prepare_data(repo_path: str, extensions: list, output_file: str):
29 | """
30 | Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file.
31 |
32 | Args:
33 | repo_path: Path to the repository to collect files from.
34 | extensions: List of file extensions to include in the data preparation.
35 | output_file: Path to the output file where the concatenated content will be saved.
36 |
37 | Returns:
38 | A string containing the entire content written to the output file.
39 | """
40 |
41 | files = []
42 | for ext in extensions:
43 | files.extend(
44 | glob.glob(os.path.join(repo_path, "**", f"*.{ext}"), recursive=True)
45 | )
46 |
47 | with open(output_file, "w", encoding="utf-8") as outfile:
48 | for path in files:
49 | with open(path, "r", encoding="utf-8") as file:
50 | content = file.read()
51 | outfile.write(f"### {path} ###\n")
52 | outfile.write(content)
53 | outfile.write("\n\n")
54 |
55 | with open(output_file, "r") as f:
56 | return f.read()
57 |
58 |
59 | def data_for_training(content: str, config: dict):
60 | """
61 | Tokenizes the content and prepares it for language model training, including creating a data collator.
62 |
63 | Args:
64 | content: The concatenated text content to be tokenized.
65 | config: Dictionary containing the model and training configuration.
66 |
67 | Returns:
68 | A tuple containing the tokenized dataset and the data collator for language model training.
69 | """
70 |
71 | tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"])
72 | context_length = config["Model"]["context_length"]
73 | outputs = tokenizer(
74 | content,
75 | truncation=True,
76 | max_length=context_length,
77 | return_overflowing_tokens=True,
78 | return_length=True,
79 | )
80 | print(f"Input IDs length: {len(outputs['input_ids'])}")
81 | print(f"Input chunk lengths: {outputs['length']}")
82 | print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
83 | ds = Dataset.from_dict(outputs)
84 | ds_removed = ds.remove_columns(
85 | ["attention_mask", "length", "overflow_to_sample_mapping"]
86 | )
87 | tokenizer.pad_token = tokenizer.eos_token
88 | data_collator = DataCollatorForLanguageModeling(
89 | tokenizer, mlm=config["Training"]["masked_language_modelling"]
90 | )
91 | return ds_removed, data_collator
92 |
93 |
94 | def get_peft_model(config: dict):
95 | """
96 | Initializes and configures a P-tuning model using the specified foundational model and prompt tuning configuration.
97 |
98 | Args:
99 | config: Dictionary containing the model and training configuration.
100 |
101 | Returns:
102 | A P-tuned model ready for training.
103 | """
104 |
105 | foundational_model = AutoModelForCausalLM.from_pretrained(
106 | config["Model"]["model"], trust_remote_code=True
107 | )
108 | generation_config = PromptTuningConfig(
109 | task_type=TaskType.CAUSAL_LM,
110 | prompt_tuning_init=PromptTuningInit.RANDOM,
111 | num_virtual_tokens=config["Training"]["num_virtual_tokens"],
112 | tokenizer_name_or_path=config["Model"]["model"],
113 | )
114 |     peft_model_prompt = peft_get_peft_model(foundational_model, generation_config)
115 | peft_model_prompt.print_trainable_parameters()
116 | return peft_model_prompt
117 |
118 |
119 | def create_training_arguments(config: dict):
120 | """
121 | Creates and configures the training arguments for the Trainer object.
122 |
123 | Args:
124 | config: Dictionary containing the training configuration.
125 |
126 | Returns:
127 | A TrainingArguments object with the specified settings.
128 | """
129 |
130 | training_args = TrainingArguments(
131 | output_dir=config["Training"]["output_dir"],
132 | save_strategy="steps",
133 | per_device_train_batch_size=config["Training"]["batch_size"],
134 | auto_find_batch_size=config["Training"]["auto_batch_size"],
135 | learning_rate=config["Training"]["learning_rate"],
136 | num_train_epochs=config["Training"]["num_epochs"],
137 | push_to_hub=config["Training"]["push_to_hub"],
138 | )
139 | return training_args
140 |
141 |
142 | def create_trainer(
143 | config: dict, train_data: object, data_collator: object, model: object
144 | ):
145 | """
146 | Creates a Trainer object for training the model with the provided data and configuration.
147 |
148 | Args:
149 | config: Dictionary containing the training configuration.
150 | train_data: The tokenized dataset to be used for training hf Dataset object.
151 | data_collator: The data collator for handling the tokenized data during training.
152 | model: The P-tuned model to be trained.
153 |
154 | Returns:
155 | A Trainer object configured for training the model.
156 | """
157 |
158 | training_args = create_training_arguments(config)
159 | trainer = Trainer(
160 | model=model,
161 | args=training_args,
162 | train_dataset=train_data,
163 | data_collator=data_collator,
164 | )
165 | return trainer
166 |
167 |
168 | def main():
169 | """
170 | Main function to execute the training pipeline. It parses the YAML configuration file, prepares the data, initializes
171 | the model, and starts the training process.
172 | """
173 | parser = argparse.ArgumentParser(description="Training script for P-tuning model")
174 | parser.add_argument(
175 | "--config", type=str, required=True, help="Path to the YAML configuration file"
176 | )
177 | args = parser.parse_args()
178 |
179 | with open(args.config, "r") as file:
180 | config = yaml.safe_load(file)
181 |
182 | content = prepare_data(
183 | config["Data"]["repo_path"],
184 | config["Data"]["extensions"],
185 | config["Data"]["output_file"],
186 | )
187 |
188 | train_data, data_collator = data_for_training(content, config)
189 | model = get_peft_model(config)
190 | trainer = create_trainer(config, train_data, data_collator, model)
191 |
192 | trainer.train()
193 |
194 |
195 | if __name__ == "__main__":
196 | main()
197 |
--------------------------------------------------------------------------------
/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py:
--------------------------------------------------------------------------------
1 | """
2 | This script facilitates the fine-tuning of a language model using QLoRA (Quantized Low-Rank Adapter)
3 | adapter tuning.
4 |
5 | The main functionalities include:
6 | - Preparing data from a specified repository with specific file extensions.
7 | - Tokenizing the data for model training.
8 | - Loading and configuring a pre-trained language model.
9 | - Applying PEFT (Parameter-Efficient Fine-Tuning) using QLoRA.
10 | - Defining training arguments and creating a Trainer instance.
11 | - Executing the training process with the Trainer.
12 |
13 | Requirements:
14 | - A YAML configuration file that specifies model, training, and data parameters.
15 | """
16 |
17 | import argparse
18 | import yaml
19 | import os
20 | import glob
21 | import torch
22 | from datasets import Dataset
23 | from transformers import DataCollatorForLanguageModeling
24 | from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
25 | from transformers import (
26 | AutoModelForCausalLM,
27 | AutoTokenizer,
28 | Trainer,
29 | TrainingArguments,
30 | BitsAndBytesConfig,
31 | )
32 |
33 |
34 | def prepare_data(repo_path: str, extensions: list, output_file: str):
35 | """
36 | Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file.
37 |
38 | Args:
39 | repo_path: Path to the repository to collect files from.
40 | extensions: List of file extensions to include in the data preparation.
41 | output_file: Path to the output file where the concatenated content will be saved.
42 |
43 | Returns:
44 | A string containing the entire content written to the output file.
45 | """
46 |
47 | files = []
48 | for ext in extensions:
49 | files.extend(
50 | glob.glob(os.path.join(repo_path, "**", f"*.{ext}"), recursive=True)
51 | )
52 |
53 | with open(output_file, "w", encoding="utf-8") as outfile:
54 | for path in files:
55 | with open(path, "r", encoding="utf-8") as file:
56 | content = file.read()
57 | outfile.write(f"### {path} ###\n")
58 | outfile.write(content)
59 | outfile.write("\n\n")
60 |
61 | with open(output_file, "r") as f:
62 | return f.read()
63 |
64 |
65 | def data_for_training(content: str, config: dict):
66 | """
67 | Tokenizes the content and prepares it for language model training, including creating a data collator.
68 |
69 | Args:
70 | content: The concatenated text content to be tokenized.
71 | config: Dictionary containing the model and training configuration.
72 |
73 | Returns:
74 | A tuple containing the tokenized dataset,tokenizer,data collator for language model training.
75 | """
76 | tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"])
77 | context_length = config["Model"]["context_length"]
78 | outputs = tokenizer(
79 | content,
80 | truncation=True,
81 | max_length=context_length,
82 | return_overflowing_tokens=True,
83 | return_length=True,
84 | )
85 | print(f"Input IDs length: {len(outputs['input_ids'])}")
86 | print(f"Input chunk lengths: {outputs['length']}")
87 | print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
88 | ds = Dataset.from_dict(outputs)
89 | ds_removed = ds.remove_columns(
90 | ["attention_mask", "length", "overflow_to_sample_mapping"]
91 | )
92 | tokenizer.pad_token = tokenizer.eos_token
93 | data_collator = DataCollatorForLanguageModeling(
94 | tokenizer, mlm=config["Training"]["masked_language_modelling"]
95 | )
96 | return ds_removed, data_collator, tokenizer
97 |
98 |
99 | def load_base_model(config: dict):
100 | """
101 | Loads the base language model with specified configurations, including quantization settings.
102 |
103 | Args:
104 | config: The configuration dictionary containing model and BNB (BitsAndBytes) parameters.
105 |
106 | Returns:
107 | PreTrainedModel: The loaded pre-trained language model ready for training.
108 | """
109 |
110 | compute_dtype = getattr(torch, config["BNB_CONFIG"]["BNB_4BIT_COMPUTE_DTYPE"])
111 |
112 | bnb_config = BitsAndBytesConfig(
113 | load_in_4bit=True,
114 | bnb_4bit_quant_type="nf4",
115 | bnb_4bit_compute_dtype=compute_dtype,
116 | bnb_4bit_use_double_quant=config["BNB_CONFIG"]["USE_NESTED_QUANT"],
117 | )
118 | device_map = {"": 0}
119 |
120 | model = AutoModelForCausalLM.from_pretrained(
121 | config["MODEL"]["MODEL"],
122 | load_in_8bit=config["MODEL"]["LOAD_IN_8BIT"],
123 | quantization_config=bnb_config,
124 | device_map=device_map,
125 | use_cache=False,
126 | trust_remote_code=True,
127 | )
128 | return model
129 |
130 |
131 | def load_peft_model(model: object, config: dict):
132 | """
133 | Applies PEFT (Parameter-Efficient Fine-Tuning) using QLoRA to the given model.
134 |
135 | Args:
136 | model: The pre-trained language model to be fine-tuned.
137 | config: The configuration dictionary containing LORA (Low-Rank Adapter) parameters.
138 |
139 | Returns:
140 | PreTrainedModel: The PEFT-configured model ready for training.
141 | """
142 |
143 | model = prepare_model_for_kbit_training(model)
144 | peft_config = LoraConfig(
145 | lora_alpha=config["LORA"]["LORA_ALPHA"],
146 | lora_dropout=config["LORA"]["LORA_DROPOUT"],
147 | r=config["LORA"]["LORA_R"],
148 | bias="none",
149 | task_type="CAUSAL_LM",
150 | # target_modules=,
151 | )
152 | model = get_peft_model(model, peft_config)
153 | model.print_trainable_parameters()
154 | return model
155 |
156 |
157 | def create_training_arguments(config: dict):
158 | """
159 | Creates and returns the training arguments for the Trainer.
160 |
161 | Args:
162 | config: The configuration dictionary containing training arguments.
163 |
164 | Returns:
165 | TrainingArguments: The configured training arguments.
166 | """
167 |
168 | training_args = TrainingArguments(
169 | output_dir=f"results/{config['TRAINING_ARGUMENTS']['OUTPUT_DIR']}",
170 | num_train_epochs=3,
171 | dataloader_drop_last=True,
172 | evaluation_strategy="steps",
173 | save_strategy="steps",
174 | eval_steps=config["TRAINING_ARGUMENTS"]["EVAL_FREQ"],
175 | save_steps=config["TRAINING_ARGUMENTS"]["SAVE_FREQ"],
176 | logging_steps=config["TRAINING_ARGUMENTS"]["LOG_FREQ"],
177 | per_device_train_batch_size=64,
178 | per_device_eval_batch_size=64,
179 | learning_rate=config["TRAINING_ARGUMENTS"]["LR"],
180 | lr_scheduler_type=config["TRAINING_ARGUMENTS"]["LR_SCHEDULER_TYPE"],
181 | warmup_steps=config["TRAINING_ARGUMENTS"]["NUM_WARMUP_STEPS"],
182 | gradient_accumulation_steps=config["TRAINING_ARGUMENTS"]["GR_ACC_STEPS"],
183 | gradient_checkpointing=True,
184 | fp16=config["TRAINING_ARGUMENTS"]["FP16"],
185 | bf16=config["TRAINING_ARGUMENTS"]["BF16"],
186 | weight_decay=config["TRAINING_ARGUMENTS"]["WEIGHT_DECAY"],
187 | # push_to_hub=True,
188 | include_tokens_per_second=True,
189 | )
190 | return training_args
191 |
192 |
193 | def create_trainer(
194 |     config: dict, tokenizer: object, train_data: object, data_collator: object, model: object
195 | ):
196 | """
197 |     Creates a Trainer instance with the provided configuration, tokenizer, training data, data collator, and model.
198 |
199 | Args:
200 | tokenizer: The tokenizer to be used during training.
201 | train_data : The tokenized training dataset.
202 | data_collator: The data collator for language modeling.
203 | model : The pre-trained and fine-tuned model.
204 |
205 | Returns:
206 | Trainer: The Trainer instance for model training.
207 | """
208 |     training_args = create_training_arguments(config)
209 | trainer = Trainer(
210 | model=model,
211 | tokenizer=tokenizer,
212 | args=training_args,
213 | data_collator=data_collator,
214 | train_dataset=train_data,
215 | eval_dataset=train_data,
216 | )
217 | return trainer
218 |
219 |
220 | def main():
221 | """
222 | The main function that orchestrates the data preparation, model loading,
223 | and training processes using the provided YAML configuration.
224 | """
225 |
226 | parser = argparse.ArgumentParser(
227 | description="Training script for QLoRA adapter tuning"
228 | )
229 | parser.add_argument(
230 | "--config", type=str, required=True, help="Path to the YAML configuration file"
231 | )
232 | args = parser.parse_args()
233 |
234 | with open(args.config, "r") as file:
235 | config = yaml.safe_load(file)
236 |
237 | content = prepare_data(
238 | config["Data"]["repo_path"],
239 | config["Data"]["extensions"],
240 | config["Data"]["output_file"],
241 | )
242 |
243 | train_data, data_collator, tokenizer = data_for_training(content, config)
244 | model = load_base_model(config)
245 | model = load_peft_model(model, config)
246 | trainer = create_trainer(config, tokenizer, train_data, data_collator, model)
247 |
248 | trainer.train()
249 |
250 |
251 | if __name__ == "__main__":
252 | main()
253 |
--------------------------------------------------------------------------------
/graph_rag/evaluation/README.MD:
--------------------------------------------------------------------------------
1 |
2 | # Knowledge Graph Evaluation
3 |
4 | This module provides methods to evaluate the performance of GraphRag. The following integrations are available for evaluation:
5 |
6 | - **Llama-Index Evaluation Pack**
7 | - **Ragas Evaluation Pack**
8 |
9 | Additionally, this module includes scripts for creating custom test datasets to benchmark and evaluate GraphRag.
10 |
11 | ## Getting Started
12 | This section demonstrates how to use the functions provided in the module:
13 |
14 | ---
15 |
16 | ### 1. QA Generation and Critique
17 |
18 | This module offers tools to generate question-answer (QA) pairs from input documents using a language model and critique them based on various criteria like groundedness, relevance, and standalone quality.
19 |
20 | > #### Generate and Critique QA Pairs
21 |
22 | To use this module, follow these steps:
23 |
24 | #### 1. Generate QA Pairs
25 |
26 | First, we prepare our dataset for generating QA pairs. In this example, we'll use Keras-IO documentation and Llama-Index's `SimpleDirectoryReader` to obtain `Document` objects.
27 |
28 | ```python
29 | !git clone https://github.com/keras-team/keras-io.git
30 |
31 | def get_data(input_dir="path/to/keras-io/templates"):
32 | reader = SimpleDirectoryReader(
33 | input_dir,
34 | recursive=True,
35 | exclude=["path/to/keras-io/templates/examples"]
36 | )
37 | docs = reader.load_data()
38 |
39 | splitter = SentenceSplitter(
40 | chunk_size=300,
41 | chunk_overlap=20,
42 | )
43 | nodes = splitter.get_nodes_from_documents(docs)
44 | documents = [Document(text=node.text, metadata=node.metadata) for node in nodes]
45 |
46 |     return documents
47 |
48 | # load the documents
49 | documents=get_data()
50 | ```
51 |
52 | Use the `qa_generator` function to generate QA pairs from your input documents.
53 |
54 | ```python
55 | from evaluation.ragas_evaluation.QA_graphrag_testdataset import qa_generator
56 |
57 | N_GENERATIONS = 20
58 |
59 | # Generate the QA pairs
60 | qa_pairs = qa_generator(documents, N_GENERATIONS)
61 | ```
62 |
63 | #### 2. Critique the Generated QA Pairs
64 |
65 | Once you have generated the QA pairs, critique them using the `critique_qa` function.
66 |
67 | ```python
68 | from evaluation.ragas_evaluation.QA_graphrag_testdataset import critique_qa
69 |
70 | # Critique the generated QA pairs
71 | critiqued_qa_pairs = critique_qa(qa_pairs)
72 |
73 | # The critiqued pairs will include scores and evaluations for groundedness, relevance, and standalone quality
74 | ```
75 |
76 | ---
77 | ### 2. Evaluating Your Knowledge Graph with Llama-Index Evaluator Pack
78 |
79 | This section demonstrates how to evaluate the performance of your query engine using the Llama-Index RAG evaluator pack.
80 |
81 | > #### Evaluate Your Knowledge Graph with llama-index
82 |
83 | To evaluate your query engine, follow these steps:
84 | ```shell
85 | llamaindex-cli download-llamadataset PaulGrahamEssayDataset --download-dir ./data
86 | ```
87 |
88 | ```python
89 | from evaluation.evaluation_llama_index import evaluate
90 |
91 |
92 | # Path to your labeled RAG dataset
93 | RAG_DATASET = "./data/rag_dataset.json"
94 |
95 | # Define the language model and embedding
96 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding
97 | from llama_index.llms.ollama import Ollama
98 |
99 | llm = Ollama(base_url="http://localhost:11434", model="llama2")
100 | embedding = HuggingFaceEmbedding(model_name="microsoft/codebert-base")
101 |
102 | # Your query engine instance
103 | from graph_rag.graph_retrieval.graph_retrieval import get_index_from_pickle, get_query_engine
104 |
105 | index = get_index_from_pickle("path/to/graphIndex.pkl")
106 | query_engine = get_query_engine(index)
107 |
108 | # Evaluate the dataset
109 | evaluation_results = evaluate(RAG_DATASET, query_engine)
110 |
111 | # Review the results
112 | print(evaluation_results)
113 | ```
114 | | Metrics | RAG | Base RAG |
115 | |------------------------------|------------|-----------|
116 | | **Mean Correctness Score** | 3.340909 | 0.934 |
117 | | **Mean Relevancy Score** | 0.750000 | 4.239 |
118 | | **Mean Faithfulness Score** | 0.386364 | 0.977 |
119 | | **Mean Context Similarity Score** | 0.948765 | 0.977 |
120 |
121 |
122 |
123 | This example shows how to quickly evaluate your query engine's performance using the Llama-Index RAG evaluator pack.
124 |
125 |
126 | ---
127 | ### 3. Evaluating Your Knowledge Graph with Ragas backend
128 |
129 | You can easily evaluate the performance of your query engine using this module.
130 |
131 | > #### Load and Evaluate Your Dataset with ragas
132 |
133 | Use the `load_test_dataset` function to load your dataset and directly evaluate it using the `evaluate` function. This method handles all necessary steps, including batching the data.
134 |
135 | ```python
136 | from evaluation.ragas_evaluation.evaluation_ragas import load_test_dataset, evaluate
137 |
138 | # Step 1: Load the dataset from a pickle file
139 | dataset_path = "/content/keras_docs_embedded.pkl"
140 | test_dataset = load_test_dataset(dataset_path)
141 | ```
142 |
143 | > **Note:** `test_dataset` is a list of Llama-Index `Document` objects.
144 |
145 | ```python
146 | # Step 2: Define the language model and embedding
147 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding
148 | from llama_index.llms.ollama import Ollama
149 |
150 | llm = Ollama(base_url="http://localhost:11434", model="codellama")
151 | embedding = HuggingFaceEmbedding(model_name="microsoft/codebert-base")
152 | # Step 3: Specify the metrics for evaluation
153 | from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
154 | metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
155 |
156 | # Step 4: Load the query engine (Llama-Index)
157 | from graph_rag.graph_retrieval.graph_retrieval import get_index_from_pickle, get_query_engine
158 |
159 | index = get_index_from_pickle("path/to/graphIndex.pkl")
160 | query_engine = get_query_engine(index)
161 |
162 | # Step 5: Evaluate the dataset
163 | evaluation_results = evaluate(
164 | query_engine=query_engine,
165 | dataset=test_dataset,
166 | llm=llm,
167 | embeddings=embedding,
168 | metrics=metrics,
169 | # Default batch size is 4
170 | )
171 | ```
172 |
173 | **Output:**
174 | ```python
175 | {'faithfulness': 0.0333, 'answer_relevancy': 0.9834, 'context_precision': 0.2000, 'context_recall': 0.8048}
176 | ```
177 |
178 | ```python
179 | rdf = evaluation_results.to_pandas()
180 | rdf.to_csv("results.csv", index=False)
181 | ```
182 | ---
183 | **Detailed Result:**
184 |
185 | | question | contexts | answer | ground_truth | faithfulness | answer_relevancy | context_precision | context_recall |
186 | |-----------------------------------------------|---------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------|--------------|------------------|-------------------|----------------|
187 | | What is mixed precision in computing? | [Examples GPT-2 text generation Parameter…] | Mixed precision is a technique used to improve… | A combination of different numerical precision… | 0.166667 | 0.981859 | 0.0 | 0.666667 |
188 | | What is the title of the guide discussed in th... | [Available guides… Hyperparameter T…] | The title of the guide discussed in the given… | How to distribute training | 0.000000 | 1.000000 | 0.0 | 1.000000 |
189 | | What is Keras 3? | [No relationships found.] | Keras 3 is a new version of the popular deep l… | A deep learning framework that works with Tensor… | 0.000000 | 0.974711 | 0.0 | 0.500000 |
190 | | What was the percentage boost in StableDiffusion... | [A first example: A MNIST convnet…] | The percentage boost in StableDiffusion traini… | Over 150% | 0.000000 | 0.970565 | 1.0 | 1.000000 |
191 | | What are some examples of pretrained models av... | [No relationships found.] | Some examples of pre-trained models available… | BERT, OPT, Whisper, T5, StableDiffusion, YOLOv8… | 0.000000 | 0.989769 | 0.0 | 0.857143 |
192 |
193 |
194 |
195 |
196 |
197 |
--------------------------------------------------------------------------------
/experiments/experiment_t5_abs_summarization/experiment_t5_abs_summarization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "code",
21 | "execution_count": 11,
22 | "metadata": {
23 | "colab": {
24 | "base_uri": "https://localhost:8080/"
25 | },
26 | "id": "a1J6mrksgO0w",
27 | "outputId": "fc730428-d183-4957-c81a-3c9876a248d3"
28 | },
29 | "outputs": [
30 | {
31 | "output_type": "stream",
32 | "name": "stdout",
33 | "text": [
34 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
35 | "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n",
36 | "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.29.2)\n",
37 | "Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (1.26.15)\n",
38 | "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.0)\n",
39 | "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n",
40 | "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n",
41 | "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n",
42 | "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n",
43 | "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n",
44 | "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n",
45 | "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.5)\n",
46 | "Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.1)\n",
47 | "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n",
48 | "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n",
49 | "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n",
50 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n",
51 | "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n",
52 | "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n",
53 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n",
54 | "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.4.0)\n",
55 | "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2)\n",
56 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n",
57 | "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n",
58 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n",
59 | "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "!pip install torch transformers urllib3"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "source": [
70 | "import torch\n",
71 | "from transformers import AutoTokenizer, AutoModelWithLMHead\n",
72 | "import urllib3"
73 | ],
74 | "metadata": {
75 | "id": "DTeMo878grDl"
76 | },
77 | "execution_count": 12,
78 | "outputs": []
79 | },
80 | {
81 | "cell_type": "code",
82 | "source": [
83 | "tokenizer=AutoTokenizer.from_pretrained('T5-base')\n",
84 | "model=AutoModelWithLMHead.from_pretrained('T5-base', return_dict=True)"
85 | ],
86 | "metadata": {
87 | "colab": {
88 | "base_uri": "https://localhost:8080/"
89 | },
90 | "id": "sgRtK1S5gwy5",
91 | "outputId": "bc79d4b9-85c8-43f3-ce81-db56f0fda82e"
92 | },
93 | "execution_count": 13,
94 | "outputs": [
95 | {
96 | "output_type": "stream",
97 | "name": "stderr",
98 | "text": [
99 | "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/modeling_auto.py:1352: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n",
100 | " warnings.warn(\n"
101 | ]
102 | }
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "source": [
108 | "git_repo = input('git_repo url : ')\n",
109 | "git_repo = git_repo.replace(\"github.com\", \"raw.githubusercontent.com\")\n",
110 | "git_repo = git_repo + \"/master/README.md\"\n",
111 | "\n",
112 | "http = urllib3.PoolManager()\n",
113 | "r = http.request('GET', git_repo, preload_content=False)\n",
114 | "prompt = str(r.read()) + \"\\nsummarize: \"\n",
115 | "prompt = str(prompt)\n",
116 | "r.release_conn()\n"
117 | ],
118 | "metadata": {
119 | "colab": {
120 | "base_uri": "https://localhost:8080/"
121 | },
122 | "id": "u5uVhkdtg1_H",
123 | "outputId": "9e2bb214-8e05-4d12-8bff-41616ccd4e61"
124 | },
125 | "execution_count": 54,
126 | "outputs": [
127 | {
128 | "name": "stdout",
129 | "output_type": "stream",
130 | "text": [
131 | "git_repo url : https://github.com/kikinteractive/app\n"
132 | ]
133 | }
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "source": [
139 | "inputs=tokenizer.encode(prompt, return_tensors='pt', max_length=1024, truncation=True)"
140 | ],
141 | "metadata": {
142 | "id": "-0vT8Ychi2_9"
143 | },
144 | "execution_count": 55,
145 | "outputs": []
146 | },
147 | {
148 | "cell_type": "code",
149 | "source": [
150 | "output = model.generate(inputs, min_length=256, max_length=512)\n"
151 | ],
152 | "metadata": {
153 | "id": "hTylPPZDhSMR"
154 | },
155 | "execution_count": 56,
156 | "outputs": []
157 | },
158 | {
159 | "cell_type": "code",
160 | "source": [
161 | "summary=tokenizer.decode(output[0], skip_special_tokens=True)\n",
162 | "\n",
163 | "print(\"README : \")\n",
164 | "print(str(prompt))\n",
165 | "\n",
166 | "print(\"Summary : \")\n",
167 | "print(summary)"
168 | ],
169 | "metadata": {
170 | "colab": {
171 | "base_uri": "https://localhost:8080/"
172 | },
173 | "id": "D83RWVvihU2e",
174 | "outputId": "b7073088-8cde-4833-b3ee-21286c2b1781"
175 | },
176 | "execution_count": 57,
177 | "outputs": [
178 | {
179 | "output_type": "stream",
180 | "name": "stdout",
181 | "text": [
182 | "README : \n",
183 | "b'app.js - mobile webapps made easy\\n=================================\\n\\nApp.js is a lightweight JavaScript UI library for creating mobile webapps that behave like native apps, sacrificing neither performance nor polish.\\n\\n* cross-platform (Android 2.2+, iOS 4.3+)\\n* themable platform-specific UI designs\\n* configurable native-like transitions\\n* automatically managed navigation stack\\n* built-in widgets for general use-cases\\n\\nThe goal of App.js is to provide a robust starting point for mobile webapps, handling general scenarios, and maintaining compatiblity with other common JavaScript libraries.\\n\\n\\n##[Check out the documentation here](http://code.kik.com/app/)\\n\\n\\n##[Changelog](/CHANGELOG.md)\\n'\n",
184 | "summarize: \n",
185 | "Summary : \n",
186 | "b'app.js is a lightweight JavaScript UI library for creating mobile webapps that behave like native apps. it's designed to provide a robust starting point for mobile webapps, handling general scenarios, and maintaining compatiblity with other common JavaScript libraries. it's also a great way to create custom mobile webapps, without having to sacrifice performance or polish. b'app.js is available for ios and android 2.2+, and is .........................\n"
187 | ]
188 | }
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "source": [],
194 | "metadata": {
195 | "id": "ed7TqUT4hqF-"
196 | },
197 | "execution_count": 40,
198 | "outputs": []
199 | }
200 | ]
201 | }
--------------------------------------------------------------------------------
/graph_rag/experiments/EXPERIMENTS.MD:
--------------------------------------------------------------------------------
1 | # Experiments
2 |
3 | The major portion of my time in the first phase of the GSoC project has been spent experimenting with different models, embeddings, and libraries.
4 |
5 | ## Knowledge Graph from Documentation
6 |
7 | The majority of the documentation for libraries is stored in the form of HTML and markdown files in their GitHub repositories.
8 |
9 | We first used llama-index document loaders to load all documents with the .md extension. We then performed chunking and created Document instances from the chunks, as sketched below.
10 |
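A minimal sketch of this loading-and-chunking step, assuming a local `docs/` checkout and a recent llama-index release (the exact imports differ between versions, and the path and chunk sizes are placeholders):

```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter

# Load every .md file under the documentation directory (the path is a placeholder).
documents = SimpleDirectoryReader(
    input_dir="docs/", required_exts=[".md"], recursive=True
).load_data()

# Split the loaded documents into chunks (nodes) for graph building.
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=64)
nodes = splitter.get_nodes_from_documents(documents)
```
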
11 | ## Knowledge Graph Using Code Embeddings
12 |
13 | Implementation of the idea can be found here: [Colab](https://colab.research.google.com/drive/1uguR76SeMAukN4uAhKuXU_ja8Ik0s8Wj#scrollTo=CUgtX5D1Tl_x).
14 |
15 | The idea is to separate out code blocks (or take raw code) and split them with a code splitter, then pass the chunks to a model to build a Knowledge Graph backed by code embeddings; a minimal sketch follows the list below. I used:
16 | - Salesforce/codegen2-7B_P quantized (4-bit)
17 | - Salesforce/codet5p-110m-embedding
18 | - Python files in Keras-io
19 |
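A minimal sketch of the splitting step, using llama-index's `CodeSplitter` (which needs the tree-sitter Python grammar installed); the file path and chunking parameters here are placeholders:

```python
from llama_index.core.node_parser import CodeSplitter

# Read one of the keras-io Python files (placeholder path).
with open("keras-io/examples/vision/mnist_convnet.py") as f:
    code = f.read()

# Split the source into code-aware chunks before embedding / graph extraction.
splitter = CodeSplitter(
    language="python", chunk_lines=40, chunk_lines_overlap=15, max_chars=1500
)
chunks = splitter.split_text(code)
```
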
20 | ### Model Selection
21 |
22 | To begin with, we need a model that is open source and can run on the free Colab tier. To use larger models for a better knowledge graph, we quantized models above 20 GB to 4 bits using a bitsandbytes configuration; a minimal loading sketch follows the list below. We tried the following LLMs:
23 | - gemini pro
24 | - [QuantiPhy/zephyr-7b-beta(4bit-quantized)**](https://huggingface.co/QuantiPhy/zephyr-7b-beta-4bit-quantized)
25 | - llama3 (Ollama version)
26 | - codellama (Ollama version)
27 | - [QuantiPhy/aya-23-8B (4bit quantized)**](https://huggingface.co/QuantiPhy/aya-23-8B-4bq)
28 | - gpt-neo-2.7B(4bit-quantized)
29 | - [Salesforce/codegen2-7B_P(4bit-quantized)**](https://huggingface.co/QuantiPhy/Salesforce_codegen2-7B_P)
30 | - phi3 (Ollama)
31 | - phi3:medium (Ollama)
32 | - neural-chat (Ollama)
33 | - gemma2 (Ollama)
34 | - mistral (Ollama)
35 | ** I 4-bit-quantized all of these models myself using bitsandbytes.
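
The 4-bit loading itself used transformers with bitsandbytes. A minimal sketch, assuming recent transformers/bitsandbytes releases (the model id below is just one example from the list, and the generation settings varied per experiment):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit NF4 quantization config, used to fit the >20 GB models on a free Colab GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_id = "HuggingFaceH4/zephyr-7b-beta"  # example id only
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto"
)
```
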
36 | ### Embeddings
37 |
38 | For embeddings, we tried the following models; a short usage sketch follows the list:
39 | - microsoft/codebert-base
40 | - Salesforce/codet5p-110m-embedding
41 |
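A short sketch of embedding a code snippet with microsoft/codebert-base (mean pooling here is just one simple choice, not necessarily what every run used, and the snippet is a placeholder):

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

code = "def add(a, b):\n    return a + b"  # placeholder snippet
inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=512)

with torch.no_grad():
    outputs = model(**inputs)

# Mean-pool the token embeddings into a single vector for the snippet.
embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)
```
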
42 | ### Libraries
43 |
44 | In the initial phase, we are looking for libraries in the community that solve the problem of building Knowledge Graphs:
45 | - [llama-index knowledge-graph builder](https://github.com/run-llama/llama_index/tree/main/llama-index-core/llama_index/core/indices/knowledge_graph)
46 | - [llm-graph-builder](https://github.com/neo4j-labs/llm-graph-builder)
47 | - [graph_builder](https://github.com/sarthakrastogi/graph-rag)
48 |
49 | ### Table
50 |
51 | | Model | Embeddings | Libraries | Remarks | Documents | Artifacts |
52 | |:----------------------------|:---------------------|:---------------------------|:------------|:-------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
53 | | gemma2 (Ollama) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/gemma2/Graph_visualization_gemma2_mscb.html) [index](artifacts/gemma2/gemma2graphIndex.pkl) [collab](https://colab.research.google.com/drive/1q7FED2Lapk3D7ibqkO3NkqZ6iPNZ_x6H?usp=sharing) |
54 | | mistral (Ollama) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/mistral/Graph_visualization_mistral_mscb.html) [index](artifacts/mistral/mistralgraphIndex.pkl) [collab](https://colab.research.google.com/drive/1q7FED2Lapk3D7ibqkO3NkqZ6iPNZ_x6H?usp=sharing) |
55 | | neural-chat (Ollama) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/neural_chat/Graph_visualization_neuralchat_mscb.html) [index](artifacts/neural_chat/graphIndex_neuralchat_mscb.pkl) [collab](https://colab.research.google.com/drive/1cM6ujhiKM1v0bRYVN9F9UEgjYlwkBTt9?usp=sharing) |
56 | | phi3:medium (Ollama) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/phi3-med/Graph_visualization_phi3-med_mscb.html) [index](artifacts/phi3-med/graphIndex_phi3_medium_mscb.pkl) [collab](https://colab.research.google.com/drive/1cM6ujhiKM1v0bRYVN9F9UEgjYlwkBTt9?usp=sharing) |
57 | | phi3 (Ollama) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/phi3/Graph_visualization_phi3_mscb.html) [index](artifacts/phi3/graphIndex_phi3_mscb.pkl) [collab](https://colab.research.google.com/drive/1cM6ujhiKM1v0bRYVN9F9UEgjYlwkBTt9?usp=sharing) |
58 | | gpt-4o | open-ai | Neo4jGraphBuilder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/vizualization/visualisation.png) |
59 | | Gemini | gemini | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/ex1.html) |
60 | | Gemini | gemini | llama-index graph builder | Rate-error | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | |
61 | | Gemini | microsoft/codebert-base | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/gem_mcode_k_nlp.html) |
62 | | Zypher (4-bit) | microsoft/codebert-base | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/zy_knlp.html) |
63 | | Zypher (4-bit) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/vizualization/examp.html) |
64 | | llama3 (Ollama version) | microsoft/codebert-base | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/Graph_visualization.html) |
65 | | codellama (Ollama version) | microsoft/codebert-base | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/code_1.html) |
66 | | gpt-neo-2.7B-4bit-quantized | microsoft/codebert-base | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/graph_gpt3-neo.html) |
67 |
68 | ### Notes
69 | - ### [graph_builder](https://github.com/sarthakrastogi/graph-rag)
70 |
71 | - I explored graph_rag by Sarthak. It is fundamentally based on function calling (JSON output), and it works very well for powerful models. However, small-sized LLMs tend to make mistakes regardless of how well the prompt is crafted.
72 |     - I tried and debugged the library; the notes above reflect that experience. I modified the system prompts, which led to fewer mistakes, and added a method to download .html files for visualization. Additionally, I added methods for using open-source models served through Ollama.
73 |     - [rough_codes](https://colab.research.google.com/drive/1q6T8mK-O2XKqY-iGFz6xdrzvqLzu73lm#scrollTo=H0QG6QUVub8T) contains the code, modifications, and implementation notes for the repo.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
8 |
9 |
10 |
11 | Large Language Models are picking up pace very quickly and are turning out to be extremely good at multiple tasks. With the help of zero-shot, few-shot, and fine-tuning techniques we can effectively specialize a language model for a use case. Summarization is one such use case, and it has been widely researched for a couple of years now. Broadly, there are Abstractive and Extractive approaches. The motive of this project is to handle the summarization task (mostly an Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection and EDA, through experiments with different language models, to a production-scale system that can take a GitHub repo as reference and provide a summary. One novel challenge is using smaller models to achieve strong summarization performance. SCoRe Lab has been developing solutions that make users’ lives easier, with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and should be a useful reference for AI practitioners and system developers who aim to work from data all the way to a production-grade end product using AI and Systems. This repository holds data/data references, experiments, and a system that takes a GitHub link as input and provides a summary for the repository.
12 |
13 |
14 | ## Tools
15 |
16 | ### Project Explainer (as module)
17 |
18 | A Python module capable of providing different levels of summary for a given GitHub repo using transformer models.
19 |
20 | #### Installation
21 |
22 | ```
23 | pip install git+https://github.com/c2siorg/Project-Explainer.git@main#subdirectory=project_explainer&egg=gh_explainer
24 | ```
25 |
26 | #### Example usage
27 |
28 | ```python
29 | from project_explainer import Explainer
30 |
31 | gptExplainer = Explainer("gpt2")
32 |
33 | print(gptExplainer.brief("https://github.com/c2siorg/Project-Explainer.git"))
34 | ```
35 |
36 | #### Output
37 |
38 | ```
39 | {'prompt': {'prompt': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.'}, 'prepared_prompt': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.\nExplain the above : ', 'summary': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. 
The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.\nExplain the above : \xa0The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model.'}
40 | ```
41 |
42 | ### Project Explainer (as ui)
43 |
44 | Use Project Explainer through a simple web UI.
45 |
46 | #### Dependencies
47 |
48 | ```
49 | pip install -r project_explainer_ui/requirements.txt
50 | ```
51 |
52 | #### Example usage
53 |
54 | ```
55 | python project_explainer_ui/ui.py
56 | ```
57 |
58 | 
59 |
60 |
61 | ### Project Repository Utilities (gh_processor py module)
62 |
63 | A simple Python module packed with utilities to process files in a project repository, such as a Git repository.
64 |
65 | #### Installation
66 |
67 | ```
68 | pip install git+https://github.com/c2siorg/Project-Explainer.git@main#subdirectory=project_processor&egg=gh_processor
69 | ```
70 |
71 | #### Example usage
72 |
73 | ```python
74 | from gh_processor import download_github_repo, extract_headings_with_paragraphs_from_markdown, get_files_by_extension
75 |
76 | git_url = "https://github.com/c2siorg/Project-Explainer.git"
77 |
78 | repo_path = download_github_repo(git_url)
79 |
80 | print(repo_path)
81 |
82 | markdown_files = get_files_by_extension(repo_path, [".md"])
83 |
84 | headings_with_content = {}
85 |
86 | print(markdown_files)
87 |
88 | for markdown_file in markdown_files:
89 | print(markdown_file)
90 | headings_with_content[markdown_file] = extract_headings_with_paragraphs_from_markdown(markdown_file)
91 |
92 | print(headings_with_content)
93 | ```
94 |
95 | #### Output
96 |
97 | ```
98 | {'/Users/sripravallika/Project-Explainer/Project-Explainer/README.md': {'Project-Explainer': 'Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.'}}
99 | ```
100 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/experiments/experiment_bart_ft_abs_summarization/experiment_bart_ft_abs_summarization_eval.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4"
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "code",
21 | "source": [
22 | "# import locale\n",
23 | "# locale.getpreferredencoding = lambda: \"UTF-8\""
24 | ],
25 | "metadata": {
26 | "id": "60IU1eNJyJ4G"
27 | },
28 | "execution_count": 1,
29 | "outputs": []
30 | },
31 | {
32 | "cell_type": "code",
33 | "source": [
34 | "!pip install transformers datasets evaluate rouge_score sentencepiece"
35 | ],
36 | "metadata": {
37 | "colab": {
38 | "base_uri": "https://localhost:8080/"
39 | },
40 | "id": "HejdwoIRyD50",
41 | "outputId": "f8788dd3-0c37-40cf-c421-63dfc4b33fd7"
42 | },
43 | "execution_count": 2,
44 | "outputs": [
45 | {
46 | "output_type": "stream",
47 | "name": "stdout",
48 | "text": [
49 | "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.32.0)\n",
50 | "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.14.4)\n",
51 | "Requirement already satisfied: evaluate in /usr/local/lib/python3.10/dist-packages (0.4.0)\n",
52 | "Requirement already satisfied: rouge_score in /usr/local/lib/python3.10/dist-packages (0.1.2)\n",
53 | "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.99)\n",
54 | "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n",
55 | "Requirement already satisfied: huggingface-hub<1.0,>=0.15.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.16.4)\n",
56 | "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n",
57 | "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n",
58 | "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n",
59 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n",
60 | "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n",
61 | "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n",
62 | "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.3)\n",
63 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n",
64 | "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n",
65 | "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.7)\n",
66 | "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n",
67 | "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.3.0)\n",
68 | "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.15)\n",
69 | "Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n",
70 | "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.5)\n",
71 | "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.10/dist-packages (from evaluate) (0.18.0)\n",
72 | "Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge_score) (1.4.0)\n",
73 | "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge_score) (3.8.1)\n",
74 | "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge_score) (1.16.0)\n",
75 | "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n",
76 | "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (3.2.0)\n",
77 | "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n",
78 | "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
79 | "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n",
80 | "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.0)\n",
81 | "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
82 | "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.15.1->transformers) (4.7.1)\n",
83 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n",
84 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.4)\n",
85 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n",
86 | "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge_score) (8.1.7)\n",
87 | "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->rouge_score) (1.3.2)\n",
88 | "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
89 | "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.3)\n"
90 | ]
91 | }
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "source": [
97 | "from transformers import pipeline\n"
98 | ],
99 | "metadata": {
100 | "id": "DTeMo878grDl"
101 | },
102 | "execution_count": 3,
103 | "outputs": []
104 | },
105 | {
106 | "cell_type": "code",
107 | "source": [
108 | "from datasets import load_dataset\n",
109 | "\n",
110 | "billsum = load_dataset(\"billsum\", split=\"ca_test\")\n",
111 | "# xsum = load_dataset(\"xsum\", split=\"test\")"
112 | ],
113 | "metadata": {
114 | "id": "ed7TqUT4hqF-"
115 | },
116 | "execution_count": 4,
117 | "outputs": []
118 | },
119 | {
120 | "cell_type": "code",
121 | "source": [
122 | "# xsum"
123 | ],
124 | "metadata": {
125 | "id": "bWy_szkjrMhB"
126 | },
127 | "execution_count": 5,
128 | "outputs": []
129 | },
130 | {
131 | "cell_type": "code",
132 | "source": [
133 | "billsum"
134 | ],
135 | "metadata": {
136 | "colab": {
137 | "base_uri": "https://localhost:8080/"
138 | },
139 | "id": "9Iv45JnCsGqt",
140 | "outputId": "2cccf711-cd86-4c53-ad91-ee3c105b0206"
141 | },
142 | "execution_count": 6,
143 | "outputs": [
144 | {
145 | "output_type": "execute_result",
146 | "data": {
147 | "text/plain": [
148 | "Dataset({\n",
149 | " features: ['text', 'summary', 'title'],\n",
150 | " num_rows: 1237\n",
151 | "})"
152 | ]
153 | },
154 | "metadata": {},
155 | "execution_count": 6
156 | }
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "source": [
162 | "from tqdm import tqdm"
163 | ],
164 | "metadata": {
165 | "id": "i-eSVEM9t9R2"
166 | },
167 | "execution_count": 7,
168 | "outputs": []
169 | },
170 | {
171 | "cell_type": "code",
172 | "source": [
173 | "# xsum_pred = []\n",
174 | "# for doc in tqdm(xsum[\"document\"], total=len(xsum[\"document\"])):\n",
175 | "# prompt = doc + \"\\nsummarize:\"\n",
176 | "# inputs = tokenizer.encode(prompt, return_tensors='pt', max_length=1024, truncation=True).to(\"cuda\")\n",
177 | "# output = model.generate(inputs)\n",
178 | "# xsum_pred.append(tokenizer.decode(output[0], skip_special_tokens=True))\n"
179 | ],
180 | "metadata": {
181 | "id": "EoFqVBEksKW6"
182 | },
183 | "execution_count": 8,
184 | "outputs": []
185 | },
186 | {
187 | "cell_type": "code",
188 | "source": [
189 | "summarizer = pipeline(\"summarization\", model=\"knkarthick/MEETING_SUMMARY\", device=\"cuda:0\")\n"
190 | ],
191 | "metadata": {
192 | "id": "_tHpEqZV1EQY"
193 | },
194 | "execution_count": null,
195 | "outputs": []
196 | },
197 | {
198 | "cell_type": "code",
199 | "source": [
200 | "tokenizer_kwargs = {'truncation':True,'max_length':512,'return_tensors':'pt'}"
201 | ],
202 | "metadata": {
203 | "id": "-FQLtsSr2-jX"
204 | },
205 | "execution_count": null,
206 | "outputs": []
207 | },
208 | {
209 | "cell_type": "code",
210 | "source": [
211 | "billsum_pred = []\n",
212 | "for doc in tqdm(billsum[\"text\"], total=len(billsum[\"text\"])):\n",
213 | "\n",
214 | " output = summarizer(doc, **tokenizer_kwargs)\n",
215 | " billsum_pred.append(output[0][\"summary_text\"])"
216 | ],
217 | "metadata": {
218 | "id": "b9AKooOasS_t"
219 | },
220 | "execution_count": null,
221 | "outputs": []
222 | },
223 | {
224 | "cell_type": "code",
225 | "source": [
226 | "import evaluate"
227 | ],
228 | "metadata": {
229 | "id": "sdn5p-2wwd8w"
230 | },
231 | "execution_count": null,
232 | "outputs": []
233 | },
234 | {
235 | "cell_type": "code",
236 | "source": [
237 | "bleu = evaluate.load(\"bleu\")"
238 | ],
239 | "metadata": {
240 | "id": "NHkg44iKuWhf"
241 | },
242 | "execution_count": null,
243 | "outputs": []
244 | },
245 | {
246 | "cell_type": "code",
247 | "source": [
248 | "print(billsum[\"summary\"])"
249 | ],
250 | "metadata": {
251 | "id": "59KjrRhTwtqY"
252 | },
253 | "execution_count": null,
254 | "outputs": []
255 | },
256 | {
257 | "cell_type": "code",
258 | "source": [
259 | "results = bleu.compute(predictions=billsum_pred, references=billsum[\"summary\"])"
260 | ],
261 | "metadata": {
262 | "id": "g4LYYPthwcgs"
263 | },
264 | "execution_count": null,
265 | "outputs": []
266 | },
267 | {
268 | "cell_type": "code",
269 | "source": [
270 | "results"
271 | ],
272 | "metadata": {
273 | "id": "pO8xelg4wqRK"
274 | },
275 | "execution_count": null,
276 | "outputs": []
277 | },
278 | {
279 | "cell_type": "code",
280 | "source": [
281 | "rouge = evaluate.load(\"rouge\")"
282 | ],
283 | "metadata": {
284 | "id": "73RNJhvIw3aJ"
285 | },
286 | "execution_count": null,
287 | "outputs": []
288 | },
289 | {
290 | "cell_type": "code",
291 | "source": [
292 | "rouge_results = rouge.compute(predictions=billsum_pred, references=billsum[\"summary\"])"
293 | ],
294 | "metadata": {
295 | "id": "HlOybJAFxDI2"
296 | },
297 | "execution_count": null,
298 | "outputs": []
299 | },
300 | {
301 | "cell_type": "code",
302 | "source": [
303 | "rouge_results"
304 | ],
305 | "metadata": {
306 | "id": "xNgumPMmxK2u"
307 | },
308 | "execution_count": null,
309 | "outputs": []
310 | },
311 | {
312 | "cell_type": "code",
313 | "source": [],
314 | "metadata": {
315 | "id": "Rx6smpcCxMgk"
316 | },
317 | "execution_count": null,
318 | "outputs": []
319 | }
320 | ]
321 | }
--------------------------------------------------------------------------------
/project_processor/gh_processor/file_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import List, Dict
3 | import re
4 | import markdown2
5 | import spacy
6 |
7 |
8 | def get_files_by_extension(directory: str, extensions: List[str]) -> List[str]:
9 | """
10 | Retrieve the paths of files in the given directory that match the specified file extensions.
11 |
12 | Args:
13 | directory (str): The directory path to search for files.
14 | extensions (list): A list of file extensions to match.
15 |
16 | Returns:
17 | list: A list of file paths that match the given extensions.
18 | """
19 | file_paths = []
20 |
21 | for root, dirs, files in os.walk(directory):
22 | for file in files:
23 | file_extension = os.path.splitext(file)[1]
24 | if file_extension in extensions:
25 | file_path = os.path.join(root, file)
26 | file_paths.append(file_path)
27 |
28 | return file_paths
29 |
30 |
31 | def extract_code_blocks_from_markdown(file_path: str) -> List[str]:
32 | """
33 | Extract code blocks from a Markdown file.
34 |
35 | Args:
36 | file_path (str): The path of the Markdown file.
37 |
38 | Returns:
39 | List[str]: A list of code blocks extracted from the Markdown file.
40 | """
41 | if not file_path.endswith(".md"):
42 | raise ValueError("The provided file is not a Markdown file.")
43 |
44 | code_blocks = []
45 |
46 | with open(file_path, "r") as file:
47 | lines = file.readlines()
48 | code_block = []
49 | in_code_block = False
50 |
51 | for line in lines:
52 | if line.startswith("```"):
53 | if in_code_block:
54 | code_blocks.append("".join(code_block))
55 | code_block = []
56 | in_code_block = False
57 | else:
58 | in_code_block = True
59 | elif in_code_block:
60 | code_block.append(line)
61 |
62 | return code_blocks
63 |
64 |
65 | def extract_links_from_markdown(file_path: str) -> List[str]:
66 | """
67 | Extract links from a Markdown file.
68 |
69 | Args:
70 | file_path (str): The path of the Markdown file.
71 |
72 | Returns:
73 | List[str]: A list of links extracted from the Markdown file.
74 | """
75 | if not file_path.endswith(".md"):
76 | raise ValueError("The provided file is not a Markdown file.")
77 |
78 | links = []
79 |
80 | with open(file_path, "r") as file:
81 | content = file.read()
82 | link_pattern = r"\[(.*?)\]\((.*?)\)"
83 | matches = re.findall(link_pattern, content)
84 |
85 | for match in matches:
86 | link_text, link_url = match
87 | links.append(link_url)
88 |
89 | return links
90 |
91 |
92 | def extract_images_from_markdown(file_path: str) -> List[str]:
93 | """
94 | Extract image URLs from a Markdown file.
95 |
96 | Args:
97 | file_path (str): The path of the Markdown file.
98 |
99 | Returns:
100 | List[str]: A list of image URLs extracted from the Markdown file.
101 | """
102 | if not file_path.endswith(".md"):
103 | raise ValueError("The provided file is not a Markdown file.")
104 |
105 | images = []
106 |
107 | with open(file_path, "r") as file:
108 | content = file.read()
109 | image_pattern = r"!\[(.*?)\]\((.*?)\)"
110 | matches = re.findall(image_pattern, content)
111 |
112 | for match in matches:
113 | alt_text, image_url = match
114 | images.append(image_url)
115 |
116 | return images
117 |
118 |
119 | def extract_headings_with_paragraphs_from_markdown(file_path: str) -> dict:
120 | """
121 | Extract headings and the paragraph text below each heading from a Markdown file.
122 |
123 | Args:
124 | file_path (str): The path of the Markdown file.
125 |
126 | Returns:
127 | dict: A dictionary where the keys are the headings and the values are the corresponding paragraphs.
128 | """
129 | if not file_path.endswith(".md"):
130 | raise ValueError("The provided file is not a Markdown file.")
131 |
132 | heading_paragraphs = {}
133 |
134 | with open(file_path, "r") as file:
135 | content = file.read()
136 | heading_pattern = r"#+\s(.+)"
137 | matches = re.findall(heading_pattern, content)
138 |
139 |         for match in matches:
140 |             heading = match
141 |             next_line_index = content.index(match) + len(match) + 1
142 |             remaining = content[next_line_index:].strip()
143 | 
144 |             # Keep only the text up to the next heading, not the rest of the file.
145 |             next_heading = re.search(r"^#+\s", remaining, flags=re.MULTILINE)
146 |             if next_heading:
147 |                 remaining = remaining[:next_heading.start()].strip()
148 | 
149 |             heading_paragraphs[heading] = remaining
150 |
151 | return heading_paragraphs
152 |
153 |
154 | def extract_tables_from_markdown(file_path: str) -> List[List[str]]:
155 | """
156 | Extract tables from a Markdown file.
157 |
158 | Args:
159 | file_path (str): The path of the Markdown file.
160 |
161 | Returns:
162 | List[List[str]]: A list of tables extracted from the Markdown file.
163 | """
164 | if not file_path.endswith(".md"):
165 | raise ValueError("The provided file is not a Markdown file.")
166 |
167 | tables = []
168 |
169 | with open(file_path, "r") as file:
170 | content = file.read()
171 |         table_pattern = r"^\|.*\|(?:\n\|.*\|)+"
172 |         matches = re.finditer(table_pattern, content, flags=re.MULTILINE)
173 | 
174 |         for match in matches:
175 |             # Split the matched table into rows, then each row into its cells.
176 |             table_lines = match.group(0).strip().split("\n")
177 |             tables.append([row.strip().strip("|").split("|") for row in table_lines])
178 |
179 | return tables
180 |
181 |
182 | def extract_project_description_from_readme(file_path: str) -> str:
183 | """
184 | Extract the project description from a README.md file.
185 |
186 | Args:
187 | file_path (str): The path of the README.md file.
188 |
189 | Returns:
190 | str: The project description extracted from the README.md file.
191 | """
192 | if not file_path.endswith(".md"):
193 | raise ValueError("The provided file is not a .md file.")
194 |
195 | with open(file_path, "r") as file:
196 | lines = file.readlines()
197 | description = ""
198 | in_description = False
199 |
200 | for line in lines:
201 | line = line.strip()
202 |
203 | if not line:
204 | continue
205 |
206 | if not in_description:
207 | if line.lower().startswith("#"):
208 | in_description = True
209 | description += line.lstrip("#").strip() + " "
210 | else:
211 | if line.lower().startswith("#"):
212 | break
213 | else:
214 | description += line + " "
215 |
216 | return description.strip()
217 |
218 |
219 | def convert_markdown_to_html(markdown_text: str) -> str:
220 | """
221 | Convert Markdown text to html.
222 |
223 | Args:
224 | markdown_text (str): The Markdown text to be converted.
225 |
226 | Returns:
227 | str: The html equivalent text of the Markdown text.
228 | """
229 | plain_text = markdown2.markdown(
230 | markdown_text, extras=["tables", "fenced-code-blocks"])
231 | return plain_text
232 |
233 |
234 | def convert_markdown_file_to_html(file_path: str) -> str:
235 | """
236 | Convert Markdown file to html.
237 |
238 | Args:
239 | file_path (str): The path to the Markdown file.
240 |
241 | Returns:
242 | str: The html equivalent content of the Markdown file.
243 |
244 | Raises:
245 | ValueError: If the file is not a Markdown file.
246 | """
247 | if not file_path.lower().endswith('.md'):
248 | raise ValueError("The file is not a Markdown file.")
249 |
250 | with open(file_path, 'r') as file:
251 | markdown_text = file.read()
252 |
253 | html_content = convert_markdown_to_html(markdown_text)
254 |
255 | return html_content
256 |
257 |
258 | def check_phrase_similarity_using_spacyweb(phrase1: str, phrase2: str, threshold: float = 0.5) -> bool:
259 | """
260 | Checks the similarity between two phrases using spaCy's pre-trained word vectors.
261 |
262 | Args:
263 | phrase1 (str): The first phrase.
264 | phrase2 (str): The second phrase.
265 | threshold (float): The threshold similarity score.
266 |
267 | Returns:
268 | bool: True if the similarity score is above the threshold, False otherwise.
269 | """
270 | # python -m spacy download en_core_web_lg
271 | nlp = spacy.load("en_core_web_lg")
272 |
273 | doc1 = nlp(phrase1)
274 | doc2 = nlp(phrase2)
275 |
276 | similarity_score = doc1.similarity(doc2)
277 |
278 | return similarity_score >= threshold
279 |
280 |
281 | def check_similarity(text1: str, text2: str, strategy: str = "in") -> bool:
282 | """
283 | Checks the similarity between two texts using different strategies.
284 |
285 | Args:
286 | text1 (str): The first text.
287 | text2 (str): The second text.
288 | strategy (str, optional): The strategy to use for similarity check.
289 | Valid options are:
290 | - "in": Checks if one text is contained within the other.
291 | - "spacy_web": Checks similarity using spaCy's pre-trained word vectors.
292 |
293 | Returns:
294 | bool: True if the texts are similar based on the chosen strategy, False otherwise.
295 | """
296 | if strategy == "in":
297 | return (text1 in text2) or (text2 in text1)
298 | elif strategy == "spacy_web":
299 | return check_phrase_similarity_using_spacyweb(text1, text2, 0.5)
300 |
301 |
302 | def remove_sections_from_markdown(markdown_content: List[str], headings: List[str], strategy: str = "in") -> List[str]:
303 | """
304 | Removes sections from Markdown content based on a heading and similarity strategy.
305 |
306 | Args:
307 | markdown_content (List[str]): The list of lines in the Markdown content.
308 | headings (List[str]): List of headings to search for and remove along with its sections.
309 | strategy (str, optional): The strategy to use for similarity check. Valid options are:
310 | - "in": Checks if the heading is contained within the line.
311 | - "spacy_web": Checks similarity using spaCy's pre-trained word vectors.
312 |
313 | Returns:
314 | List[str]: The updated Markdown content with the specified sections removed.
315 | """
316 |     updated_content = []
317 |     skip_section = False
318 | 
319 |     for line in markdown_content:
320 |         if line.startswith('#'):
321 |             # A heading either starts a section to remove (if it matches one of
322 |             # the target headings) or ends the section currently being skipped.
323 |             skip_section = any(
324 |                 check_similarity(heading, line, strategy) for heading in headings
325 |             )
326 | 
327 |         if not skip_section:
328 |             updated_content.append(line)
329 | 
330 |     return updated_content
331 |
332 |
333 | def remove_headings_from_markdown_file(file_path: str, heading: str) -> List[str]:
334 | """
335 | Removes the specified heading and all the subsequent subheadings and paragraphs from the markdown file.
336 |
337 | Args:
338 | file_path (str): The path to the markdown file.
339 | heading (str): The heading to be removed along with its subsequent sections.
340 |
341 | Returns:
342 | List[str]: The updated markdown content with the specified heading and its subsequent sections removed.
343 |
344 | Raises:
345 | ValueError: If the file is not a Markdown file.
346 | """
347 | if not file_path.lower().endswith('.md'):
348 | raise ValueError("The file is not a Markdown file.")
349 |
350 | with open(file_path, 'r') as file:
351 | markdown_content = file.readlines()
352 |
353 |     updated_content = remove_sections_from_markdown(markdown_content, [heading])
354 |
355 | return updated_content
356 |
357 |
358 | def get_elements_from_markdown_file(file_path: str, elements: List[str]) -> Dict[str, str]:
359 | """
360 | Extracts specific elements from a Markdown file.
361 |
362 | Args:
363 | file_path (str): The path to the Markdown file.
364 | elements (List[str]): A list of elements to extract. Valid options are:
365 | - "links": Extracts links from the Markdown file.
366 | - "images": Extracts images from the Markdown file.
367 | - "headings": Extracts headings with their corresponding paragraphs from the Markdown file.
368 | - "code": Extracts code blocks from the Markdown file.
369 | - "tables": Extracts tables from the Markdown file.
370 | - "description": Extracts the project description from a README file.
371 |
372 | Returns:
373 | Dict[str, str]: A dictionary containing the extracted elements as key-value pairs.
374 | The keys correspond to the requested elements, and the values contain the extracted content.
375 |
376 | Raises:
377 | ValueError: If the file is not a Markdown file.
378 | """
379 | if not file_path.lower().endswith('.md'):
380 | raise ValueError("The file is not a Markdown file.")
381 |
382 | elements_to_extract = {
383 | "links": extract_links_from_markdown,
384 | "images": extract_images_from_markdown,
385 | "headings": extract_headings_with_paragraphs_from_markdown,
386 | "code": extract_code_blocks_from_markdown,
387 | "tables": extract_tables_from_markdown,
388 | "description": extract_project_description_from_readme
389 | }
390 |
391 | result = {}
392 |
393 | for element in elements:
394 | if element not in elements_to_extract.keys():
395 | continue
396 | result[element] = elements_to_extract.get(element)(file_path)
397 |
398 | return result
399 |
400 |
401 | def remove_images_from_markdown(file_path: str) -> str:
402 | """
403 | Removes image tags from a Markdown file and returns the updated content without images.
404 |
405 | Args:
406 | file_path: The path to the Markdown file.
407 |
408 | Returns:
409 | The Markdown content without images.
410 |
411 | Raises:
412 | ValueError: If the provided file is not a Markdown file or if the file does not exist.
413 | """
414 |
415 | if not file_path.lower().endswith('.md'):
416 | raise ValueError(
417 | "Invalid file. Only Markdown files (.md) are supported.")
418 |
419 | if not os.path.isfile(file_path):
420 | raise ValueError("File not found.")
421 |
422 | with open(file_path, 'r') as f:
423 | markdown_content = f.read()
424 |
425 | markdown_content_without_images = re.sub(
426 |         r'!\[.*?\]\(.*?\)', '', markdown_content)
427 |
428 | return markdown_content_without_images
429 |
430 |
431 | def remove_links_from_markdown(file_path: str) -> str:
432 | """
433 | Removes link tags from a Markdown file and returns the updated content.
434 |
435 | Args:
436 | file_path: The path to the Markdown file.
437 |
438 | Returns:
439 | The Markdown content without links.
440 |
441 | Raises:
442 | ValueError: If the provided file is not a Markdown file or if the file does not exist.
443 | """
444 |
445 | if not file_path.lower().endswith('.md'):
446 | raise ValueError(
447 | "Invalid file. Only Markdown files (.md) are supported.")
448 |
449 | if not os.path.isfile(file_path):
450 | raise ValueError("File not found.")
451 |
452 | with open(file_path, 'r') as f:
453 | markdown_content = f.read()
454 |
455 | markdown_content_without_links = re.sub(
456 |         r'\[.*?\]\(.*?\)', '', markdown_content)
457 |
458 | return markdown_content_without_links
459 |
460 |
461 | def remove_code_blocks_from_markdown(file_path: str) -> str:
462 | """
463 | Removes code blocks from a Markdown file and returns the updated content.
464 |
465 | Args:
466 | file_path: The path to the Markdown file.
467 |
468 | Returns:
469 | The Markdown content without code blocks.
470 |
471 | Raises:
472 | ValueError: If the provided file is not a Markdown file or if the file does not exist.
473 | """
474 |
475 | if not file_path.lower().endswith('.md'):
476 | raise ValueError(
477 | "Invalid file. Only Markdown files (.md) are supported.")
478 |
479 | if not os.path.isfile(file_path):
480 | raise ValueError("File not found.")
481 |
482 | with open(file_path, 'r') as f:
483 | markdown_content = f.read()
484 |
485 | markdown_content_without_code_blocks = re.sub(
486 |         r'```[\s\S]*?```', '', markdown_content)
487 |
488 | return markdown_content_without_code_blocks
489 |
490 |
491 | def remove_tables_from_markdown(file_path: str) -> str:
492 | """
493 | Removes tables from a Markdown file and returns the updated content.
494 |
495 | Args:
496 | file_path: The path to the Markdown file.
497 |
498 | Returns:
499 | The Markdown content without tables.
500 |
501 | Raises:
502 | ValueError: If the provided file is not a Markdown file or if the file does not exist.
503 | """
504 |
505 | if not file_path.lower().endswith('.md'):
506 | raise ValueError(
507 | "Invalid file. Only Markdown files (.md) are supported.")
508 |
509 | if not os.path.isfile(file_path):
510 | raise ValueError("File not found.")
511 |
512 | with open(file_path, 'r') as f:
513 | markdown_content = f.read()
514 |
515 | markdown_content_without_tables = re.sub(
516 |         r'(?m)^\|.*\|(?:\n\|.*\|)+\n?', '', markdown_content)
517 |
518 | return markdown_content_without_tables
519 |
--------------------------------------------------------------------------------