├── .flake8 ├── project_explainer ├── gh_explainer │ ├── __init__.py │ └── summarize.py ├── .gitignore ├── pyproject.toml └── examples │ └── examples.py ├── static ├── ui.png └── logos │ ├── logo.png │ └── logo.svg ├── graph_rag ├── evaluation │ ├── random │ │ ├── dataset_200_llama3.pkl │ │ ├── keras_docs_embedded.pkl │ │ └── results_5.csv │ ├── evaluation_llama_index.py │ ├── ragas_evaluation │ │ ├── prompts.py │ │ ├── evaluation_ragas.py │ │ └── QA_graphrag_testdataset.py │ └── README.MD ├── graph_builder │ ├── Example │ │ ├── random │ │ │ └── visualisation.png │ │ └── build_with_relic.MD │ ├── requirements.txt │ ├── main.py │ ├── README.MD │ ├── knowledgeGraph.py │ └── tools.py ├── experiments │ ├── artifacts │ │ ├── data_keras │ │ │ ├── index4.md │ │ │ ├── index5.md │ │ │ ├── index3.md │ │ │ ├── index1.md │ │ │ └── index2.md │ │ ├── gemma2 │ │ │ └── gemma2graphIndex.pkl │ │ ├── mistral │ │ │ └── mistralgraphIndex.pkl │ │ ├── phi3 │ │ │ └── graphIndex_phi3_mscb.pkl │ │ ├── vizualization │ │ │ └── visualisation.png │ │ ├── phi3-med │ │ │ └── graphIndex_phi3_medium_mscb.pkl │ │ └── neural_chat │ │ │ └── graphIndex_neuralchat_mscb.pkl │ └── EXPERIMENTS.MD └── graph_retrieval │ ├── training_scripts │ ├── prompt_tuning │ │ ├── config.yaml │ │ └── p_tuning.py │ └── QLoRA_tuning │ │ ├── config.yaml │ │ └── qlora_adapter.py │ ├── graph_retrieval.py │ └── README.MD ├── project_explainer_ui ├── README.md ├── requirements.txt ├── .gitignore └── ui.py ├── project_processor ├── README.md ├── .gitignore ├── pyproject.toml ├── gh_processor │ ├── github_downloader.py │ ├── __init__.py │ └── file_utils.py └── examples │ └── examples.py ├── .gitignore ├── .github └── workflows │ └── python-app.yml ├── experiments ├── experiment_t5_abs_summarization │ └── experiment_t5_abs_summarization.ipynb └── experiment_bart_ft_abs_summarization │ └── experiment_bart_ft_abs_summarization_eval.ipynb ├── README.md └── LICENSE /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = *.ipynb -------------------------------------------------------------------------------- /project_explainer/gh_explainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .summarize import Explainer -------------------------------------------------------------------------------- /static/ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/static/ui.png -------------------------------------------------------------------------------- /static/logos/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/static/logos/logo.png -------------------------------------------------------------------------------- /graph_rag/evaluation/random/dataset_200_llama3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/evaluation/random/dataset_200_llama3.pkl -------------------------------------------------------------------------------- /graph_rag/evaluation/random/keras_docs_embedded.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/evaluation/random/keras_docs_embedded.pkl 
-------------------------------------------------------------------------------- /graph_rag/graph_builder/Example/random/visualisation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/graph_builder/Example/random/visualisation.png -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/data_keras/index4.md: -------------------------------------------------------------------------------- 1 | # KerasCV 2 | 3 | These guides cover the [KerasCV](/keras_cv/) library. 4 | 5 | ## Available guides 6 | 7 | {{toc}} 8 | -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/data_keras/index5.md: -------------------------------------------------------------------------------- 1 | # KerasNLP 2 | 3 | These guides cover the [KerasNLP](/keras_nlp/) library. 4 | 5 | ## Available guides 6 | 7 | {{toc}} 8 | -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/gemma2/gemma2graphIndex.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/gemma2/gemma2graphIndex.pkl -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/mistral/mistralgraphIndex.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/mistral/mistralgraphIndex.pkl -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/phi3/graphIndex_phi3_mscb.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/phi3/graphIndex_phi3_mscb.pkl -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/vizualization/visualisation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/vizualization/visualisation.png -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/phi3-med/graphIndex_phi3_medium_mscb.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/phi3-med/graphIndex_phi3_medium_mscb.pkl -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/neural_chat/graphIndex_neuralchat_mscb.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c2siorg/Project-Explainer/HEAD/graph_rag/experiments/artifacts/neural_chat/graphIndex_neuralchat_mscb.pkl -------------------------------------------------------------------------------- /graph_rag/graph_builder/requirements.txt: -------------------------------------------------------------------------------- 1 | llama-index-embeddings-huggingface 2 | llama-index-llms-ollama 3 | llama-index 4 | pyvis 5 | tree-sitter==0.21.3 6 | tree-sitter-languages 7 | tqdm 8 | 
ragas 9 | datasets 10 | pandas -------------------------------------------------------------------------------- /project_explainer_ui/README.md: -------------------------------------------------------------------------------- 1 | ### Project Explainer UI 2 | 3 | Simple UI using Gradio. 4 | 5 | ### Dependencies 6 | 7 | ``` 8 | pip install -r requirements.txt 9 | ``` 10 | 11 | ### Start UI 12 | 13 | ``` 14 | python ui.py 15 | ``` 16 | -------------------------------------------------------------------------------- /project_explainer_ui/requirements.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | -e git+https://github.com/c2siorg/Project-Explainer.git@main#subdirectory=project_processor&egg=gh_processor 3 | -e git+https://github.com/c2siorg/Project-Explainer.git@main#subdirectory=project_explainer&egg=gh_explainer -------------------------------------------------------------------------------- /project_processor/README.md: -------------------------------------------------------------------------------- 1 | ## Project Repository Utilities 2 | 3 | A simple Python module packed with utilities for processing files in a project repository, such as a Git repository. 4 | 5 | ## Installation 6 | 7 | ``` 8 | pip install git+https://github.com/c2siorg/Project-Explainer.git@main#subdirectory=project_processor&egg=gh_processor 9 | ``` 10 | 11 | -------------------------------------------------------------------------------- /graph_rag/graph_builder/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file calls a series of functions from graph_rag to build a knowledge graph. 3 | """ 4 | 5 | from tools import initialize_llm, load_directory 6 | from knowledgeGraph import build_graph, save_index 7 | 8 | 9 | initialize_llm() 10 | documents = load_directory("/data") 11 | index = build_graph(documents) 12 | save_index(index) 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | dist/ 11 | build/ 12 | *.egg-info/ 13 | 14 | # Virtual environments 15 | venv/ 16 | env/ 17 | *.env 18 | 19 | # Development tools 20 | .tox/ 21 | .idea/ 22 | .vscode/ 23 | 24 | # IDE-specific files 25 | *.swp 26 | *.swo 27 | *.pyc 28 | .DS_Store 29 | -------------------------------------------------------------------------------- /project_explainer/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | dist/ 11 | build/ 12 | *.egg-info/ 13 | 14 | # Virtual environments 15 | venv/ 16 | env/ 17 | *.env 18 | 19 | # Development tools 20 | .tox/ 21 | .idea/ 22 | .vscode/ 23 | 24 | # IDE-specific files 25 | *.swp 26 | *.swo 27 | *.pyc 28 | .DS_Store 29 | -------------------------------------------------------------------------------- /project_processor/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | dist/ 11 | build/ 12 | *.egg-info/ 13 | 14 | # Virtual
environments 15 | venv/ 16 | env/ 17 | *.env 18 | 19 | # Development tools 20 | .tox/ 21 | .idea/ 22 | .vscode/ 23 | 24 | # IDE-specific files 25 | *.swp 26 | *.swo 27 | *.pyc 28 | .DS_Store 29 | -------------------------------------------------------------------------------- /project_explainer_ui/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | dist/ 11 | build/ 12 | *.egg-info/ 13 | 14 | # Virtual environments 15 | venv/ 16 | env/ 17 | *.env 18 | 19 | # Development tools 20 | .tox/ 21 | .idea/ 22 | .vscode/ 23 | 24 | # IDE-specific files 25 | *.swp 26 | *.swo 27 | *.pyc 28 | .DS_Store 29 | -------------------------------------------------------------------------------- /graph_rag/graph_retrieval/training_scripts/prompt_tuning/config.yaml: -------------------------------------------------------------------------------- 1 | Data: 2 | repo_path : '/content/keras-io/templates' 3 | extensions : ['md'] 4 | output_file : 'merged_output.txt' 5 | ModeL: 6 | model: 'bigcode/starcoderbase-1b' 7 | context_length: 128 8 | Training: 9 | masked_language_modelling: False 10 | num_virtual_tokens : 4 11 | num_epochs : 6 12 | learning_rate: 0.0035 13 | output_dir: "/" 14 | max_steps: 4 15 | batch_size: 25 16 | auto_batch_size : False 17 | push_to_hub: False 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /project_explainer/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "gh_explainer" 7 | description = "explains a give github repo" 8 | readme = "README.md" 9 | requires-python = ">=3.7" 10 | license = {text = "Apache 2.0"} 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | ] 14 | dependencies = ["setuptools>=42", "wheel", "transformers", "jinja2", "torch"] 15 | 16 | dynamic = ["version"] 17 | 18 | [tool.setuptools] 19 | py-modules = ["gh_explainer"] 20 | -------------------------------------------------------------------------------- /project_processor/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "gh_processor" 7 | description = "github repo file level processor utils" 8 | readme = "README.md" 9 | requires-python = ">=3.7" 10 | license = {text = "Apache 2.0"} 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | ] 14 | dependencies = ["setuptools>=42", "wheel", "gitpython", "markdown2", "spacy"] 15 | 16 | dynamic = ["version"] 17 | 18 | [tool.setuptools] 19 | py-modules = ["gh_processor"] 20 | -------------------------------------------------------------------------------- /project_explainer_ui/ui.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | from gh_explainer import Explainer 3 | 4 | def summarize(summarization_type, github_project_url, github_project_branch="main", huggingface_model_id="gpt2"): 5 | gptExplainer = Explainer(huggingface_model_id) 6 | if summarization_type == "brief": 7 | return gptExplainer.brief(github_url=github_project_url, branch=github_project_branch)["summary"] 8 | return 
gptExplainer.outline(github_url=github_project_url, branch=github_project_branch)["summary"] 9 | 10 | demo = gr.Interface( 11 | fn=summarize, 12 | inputs=[gr.Dropdown(["brief", "outline"], label="summary level"), "text", "text", "text"], 13 | outputs=["text"], 14 | ) 15 | demo.launch() -------------------------------------------------------------------------------- /graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/config.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | MODEL: "codellama/CodeLlama-7b-Instruct-hf" 3 | SEQ_LENGTH: 2048 4 | LOAD_IN_8BIT: False 5 | 6 | DATA: 7 | REPO_PATH: '/content/keras-io/templates' 8 | SEED: 0 9 | EXTENSIONS: [ 'md' ] 10 | OUTPUT_FILE: 'merged_output.txt'# Column name containing the code content 11 | 12 | TRAINING_ARGUMENTS: 13 | BATCH_SIZE: 64 14 | GR_ACC_STEPS: 1 15 | LR: 5e-4 16 | LR_SCHEDULER_TYPE: "cosine" 17 | WEIGHT_DECAY: 0.01 18 | NUM_WARMUP_STEPS: 30 19 | EVAL_FREQ: 100 20 | SAVE_FREQ: 100 21 | LOG_FREQ: 10 22 | OUTPUT_DIR: 23 | BF16: True 24 | FP16: False 25 | 26 | LORA: 27 | LORA_R: 8 28 | LORA_ALPHA: 32 29 | LORA_DROPOUT: 0.0 30 | LORA_TARGET_MODULES: 31 | 32 | BNB_CONFIG: 33 | USE_NESTED_QUANT: True 34 | BNB_4BIT_COMPUTE_DTYPE: "bfloat16" 35 | 36 | -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/data_keras/index3.md: -------------------------------------------------------------------------------- 1 | # KerasCV Bounding Boxes 2 | 3 | All KerasCV components that process bounding boxes require a `bounding_box_format` 4 | argument. This argument allows you to seamlessly integrate KerasCV components into 5 | your own workflows while preserving proper behavior of the components themselves. 6 | 7 | Bounding boxes are represented by dictionaries with two keys: `'boxes'` and `'classes'`: 8 | 9 | ``` 10 | { 11 | 'boxes': [batch, num_boxes, 4], 12 | 'classes': [batch, num_boxes] 13 | } 14 | ``` 15 | 16 | To ensure your bounding boxes comply with the KerasCV specification, you can use [`keras_cv.bounding_box.validate_format(boxes)`](https://github.com/keras-team/keras-cv/blob/master/keras_cv/bounding_box/validate_format.py). 17 | 18 | The bounding box formats supported in KerasCV 19 | [are listed in the API docs](/api/keras_cv/bounding_box/formats) 20 | If a format you would like to use is missing, 21 | [feel free to open a GitHub issue on KerasCV](https://github.com/keras-team/keras-cv/issues)! 22 | -------------------------------------------------------------------------------- /project_processor/gh_processor/github_downloader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from git import Repo 3 | import os 4 | 5 | logger = logging.getLogger(__name__) 6 | logger.setLevel(logging.INFO) 7 | 8 | console_handler = logging.StreamHandler() 9 | console_handler.setLevel(logging.INFO) 10 | 11 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 12 | console_handler.setFormatter(formatter) 13 | 14 | logger.addHandler(console_handler) 15 | 16 | 17 | def download_github_repo(repo_url: str, branch: str = "main") -> str: 18 | """ 19 | Download a GitHub repository from the provided URL. 20 | 21 | Args: 22 | repo_url (str): The URL of the GitHub repository. 23 | branch (str): The branch of the GitHub repository. 
24 | 25 | Returns: 26 | repo_path (str): Absolute path to downloaded repo 27 | """ 28 | repo_name = repo_url.split("/")[-1].split(".")[0] 29 | repo_path = os.path.abspath(repo_name) 30 | 31 | Repo.clone_from(repo_url, repo_name, branch=branch) 32 | 33 | logger.info(f"Repository '{repo_name}' downloaded successfully!") 34 | return repo_path 35 | -------------------------------------------------------------------------------- /project_processor/gh_processor/__init__.py: -------------------------------------------------------------------------------- 1 | from .github_downloader import download_github_repo 2 | 3 | from .file_utils import (extract_code_blocks_from_markdown, 4 | extract_headings_with_paragraphs_from_markdown, 5 | extract_images_from_markdown, 6 | extract_links_from_markdown, 7 | extract_project_description_from_readme, 8 | extract_tables_from_markdown, 9 | get_files_by_extension, 10 | get_elements_from_markdown_file, 11 | remove_headings_from_markdown_file, 12 | remove_sections_from_markdown, 13 | convert_markdown_file_to_html, 14 | convert_markdown_to_html, 15 | check_phrase_similarity_using_spacyweb, 16 | check_similarity, 17 | remove_code_blocks_from_markdown, 18 | remove_images_from_markdown, 19 | remove_links_from_markdown, 20 | remove_tables_from_markdown) 21 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | bash -c 'pytest .; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' 40 | -------------------------------------------------------------------------------- /graph_rag/graph_builder/README.MD: -------------------------------------------------------------------------------- 1 | # Knowledge Graph Builder 2 | 3 | This project lets you build a Knowledge Graph from unstructured data (.md, .py files for now). 
4 | 5 | ## Table of Contents 6 | 7 | - [Installation from Source](#installation-from-source) 8 | - [Usage](#usage) 9 | 10 | 11 | 12 | 13 | ## Installation from Source 14 | 15 | Follow these instructions to set up the project: 16 | 17 | ```bash 18 | git clone https://github.com/debrupf2946/KnowledgeGraphBuilder.git 19 | cd KnowledgeGraphBuilder 20 | pip3 install -r requirements.txt 21 | ``` 22 | 23 | ## Usage 24 | 25 | ### Data Preparation 26 | 27 | 1. First, create or import a data directory at the root folder containing documents (.md files). 28 | 2. Copy the path of the directory. 29 | 3. Load and chunk the documents using `load_directory(PATH)`. 30 | 31 | ```python 32 | documents = load_directory("/data") 33 | ``` 34 | 35 | ### LLM Setup 36 | 37 | Users need to set up the LLM (llama3) locally to build the Knowledge Graph. 38 | 39 | 1. Initialize the LLM with `initialize_llm()`. 40 | 2. The default parameters are: 41 | - `base_url="http://localhost:11434"` (Ollama server) 42 | - `model="llama3"` 43 | - `chunk_size = 512` 44 | 3. Change the parameters as needed. 45 | 46 | ```python 47 | initialize_llm() 48 | ``` 49 | 50 | ### Build Graph Index 51 | 52 | 1. Build the Knowledge Graph using the [documents](#data-preparation). 53 | 2. Call `build_graph(documents)` to create an index. 54 | 3. This will also save `Graph_visualization.html`, which can be opened in a browser to visualize the Knowledge Graph. 55 | 56 | ```python 57 | index = build_graph(documents) 58 | ``` 59 | 60 | 4. Save the `index` as a pickle file. 61 | 62 | ```python 63 | save_index(index) 64 | ``` 65 | The following example can be referred to for a detailed implementation: 66 | [GraphRag Example Notebook](Example/GraphRagExample.ipynb) 67 | -------------------------------------------------------------------------------- /graph_rag/evaluation/evaluation_llama_index.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script evaluates a RagDataset using a RagEvaluatorPack, which assesses query engines by benchmarking against 3 | labeled data using LLMs and embeddings. 4 | 5 | Functions: 6 | - evaluate: Evaluates the query engine using a labeled RAG dataset and specified models for both the LLM and embeddings. 7 | """ 8 | 9 | from llama_index.core.llama_dataset import LabelledRagDataset 10 | from llama_index.core.llama_pack import download_llama_pack 11 | from llama_index.llms.ollama import Ollama 12 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 13 | 14 | 15 | 16 | 17 | 18 | async def evaluate( 19 | RAG_DATASET: str, 20 | query_engine: object, 21 | ollama_model: str = "llama3", 22 | embedd_model: str = "microsoft/codebert-base", 23 | ): 24 | """ 25 | Evaluates a RAG dataset by using a query engine and benchmarks it using LLM and embedding models. 26 | 27 | Args: 28 | RAG_DATASET: Path to the JSON file containing the labeled RAG dataset. 29 | query_engine: The query engine to evaluate. 30 | ollama_model: The LLM model to use for evaluation (default: "llama3"). 31 | embedd_model: The Hugging Face embedding model to use for evaluation (default: "microsoft/codebert-base"). 32 | 33 | Returns: 34 | A DataFrame containing the benchmarking results, including LLM calls and evaluations.
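Note: this is a coroutine (it awaits RagEvaluatorPack.arun), so it must be awaited or run with asyncio, e.g. benchmark_df = asyncio.run(evaluate("rag_dataset.json", query_engine)); the dataset file name here is illustrative.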
35 | """ 36 | 37 | RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./rag_evaluator_pack") 38 | rag_dataset = LabelledRagDataset.from_json(RAG_DATASET) 39 | rag_evaluator_pack = RagEvaluatorPack( 40 | rag_dataset=rag_dataset, 41 | query_engine=query_engine, 42 | judge_llm=Ollama(base_url="http://localhost:11434", model=ollama_model), 43 | embed_model=HuggingFaceEmbedding(model_name=embedd_model), 44 | ) 45 | benchmark_df = await rag_evaluator_pack.arun( 46 | batch_size=5, # batches the number of llm calls to make 47 | sleep_time_in_seconds=1, # seconds to sleep before making an api call 48 | ) 49 | return benchmark_df 50 | -------------------------------------------------------------------------------- /project_processor/examples/examples.py: -------------------------------------------------------------------------------- 1 | from gh_processor import download_github_repo, extract_headings_with_paragraphs_from_markdown, get_files_by_extension 2 | 3 | git_url = "https://github.com/c2siorg/Project-Explainer.git" 4 | 5 | repo_path = download_github_repo(git_url) 6 | 7 | print(repo_path) 8 | 9 | markdown_files = get_files_by_extension(repo_path, [".md"]) 10 | 11 | headings_with_content = {} 12 | 13 | print(markdown_files) 14 | 15 | for markdown_file in markdown_files: 16 | print(markdown_file) 17 | headings_with_content[markdown_file] = extract_headings_with_paragraphs_from_markdown(markdown_file) 18 | 19 | print(headings_with_content) 20 | 21 | ## Output 22 | 23 | # {'/Users/sripravallika/Project-Explainer/Project-Explainer/README.md': {'Project-Explainer': 'Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. 
This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.'}} 24 | -------------------------------------------------------------------------------- /graph_rag/graph_builder/knowledgeGraph.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the functions to build and save the KnowledgeGraph Index and save it as a pickle-file 3 | """ 4 | 5 | from llama_index.core import StorageContext 6 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 7 | from llama_index.core import KnowledgeGraphIndex 8 | from llama_index.core.graph_stores import SimpleGraphStore 9 | from pyvis.network import Network 10 | import os 11 | import pickle 12 | 13 | 14 | def build_graph( 15 | documents: str, 16 | llm: str = None, 17 | max_triplets_per_chunk: int = 10, 18 | embeddings: str = "microsoft/codebert-base", 19 | include_embeddings: bool = False, 20 | ): 21 | """ 22 | This function builds KnowledgeGraph Index that can be queried 23 | Args: 24 | documents: llama-index Document type object 25 | llm: 26 | max_triplets_per_chunk: Max triplets that can be extracted from each document chunk defaults:3 27 | embeddings: Hugging-Face Embeddings model name default: microsoft/codebert-base 28 | 29 | Returns: 30 | Knowledge Graph-index, also saves html visualization file 31 | """ 32 | try: 33 | graph_store = SimpleGraphStore() 34 | storage_context = StorageContext.from_defaults(graph_store=graph_store) 35 | index = KnowledgeGraphIndex.from_documents( 36 | documents, 37 | max_triplets_per_chunk=max_triplets_per_chunk, 38 | llm=llm, 39 | embed_model=HuggingFaceEmbedding(model_name=embeddings), 40 | storage_context=storage_context, 41 | include_embeddings=include_embeddings, 42 | ) 43 | print("KG built successfully!") 44 | 45 | os.makedirs("results", exist_ok=True) 46 | g = index.get_networkx_graph() 47 | net = Network(notebook=True, cdn_resources="in_line", directed=True) 48 | net.from_nx(g) 49 | net.show("Graph_visualization.html") 50 | return index 51 | except Exception as e: 52 | print(f"Error building graph: {e}") 53 | return None 54 | 55 | 56 | def save_index(index, output_dir_path: str = "Results/"): 57 | """ 58 | Serializes the index object, so that it can be loaded and used later 59 | Args: 60 | index: Graph-Index object 61 | 62 | Returns: 63 | Saves pickle file of the Graph-Index 64 | """ 65 | try: 66 | os.makedirs(output_dir_path[:-1], exist_ok=True) 67 | with open(output_dir_path + "graphIndex", "wb") as f: 68 | pickle.dump(index, f) 69 | print("Index saved successfully!") 70 | except Exception as e: 71 | print(f"Error saving index: {e}") 72 | -------------------------------------------------------------------------------- /graph_rag/graph_retrieval/graph_retrieval.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains methods for loading graph_index from pkl file and retrieval of graph_index 3 | """ 4 | 5 | from ..graph_builder.tools import initialize_llm 6 | import pickle 7 | 8 | 9 | def get_index_from_pickle( 10 | file_path: str = "results/graphIndex.pkl", 11 | ): 12 | """ 13 | Deserializes a .pkl file to get the graph_index. 14 | Args: 15 | file_path (str): The path to the .pkl file. 16 | 17 | Returns: 18 | object: The deserialized llama_index graph_index object. 
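Example (illustrative usage): index = get_index_from_pickle("results/graphIndex.pkl"), followed by query_engine = get_query_engine(index).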
19 | 20 | """ 21 | try: 22 | with open(file_path, "rb") as file: 23 | index = pickle.load(file) 24 | return index 25 | except FileNotFoundError: 26 | print(f"File not found: {file_path}") 27 | raise 28 | except IOError as e: 29 | print(f"Error reading file: {e}") 30 | raise 31 | except pickle.UnpicklingError as e: 32 | print(f"Error deserializing file: {e}") 33 | raise 34 | 35 | 36 | def get_query_engine(index, with_embedding: bool = False, similarity_top_k: int = 5): 37 | """ 38 | create query-engine with preferred settings that is used to query graph_index 39 | Args: 40 | index (object): llama_index graph_index object 41 | with_embedding (bool): switch to True to query graph_index with embeddings Default:False 42 | similarity_top_k (int): Top number of chunks that is to be provided as context to llm for response to given query 43 | 44 | Returns: 45 | object: llama_index query_engine object 46 | 47 | """ 48 | if index is None: 49 | raise ValueError("The index must not be None.") 50 | try: 51 | initialize_llm() 52 | if with_embedding: 53 | query_engine = index.as_query_engine( 54 | include_text=True, 55 | response_mode="tree_summarize", 56 | embedding_mode="hybrid", 57 | similarity_top_k=similarity_top_k, 58 | ) 59 | else: 60 | query_engine = index.as_query_engine( 61 | include_text=True, response_mode="tree_summarize" 62 | ) 63 | return query_engine 64 | except Exception as e: 65 | print(f"An error occurred while creating the query engine: {e}") 66 | raise 67 | 68 | 69 | def graph_query(query: str, query_engine): 70 | """ 71 | method to query graph_index 72 | Args: 73 | query (str): query that is to be answered using graph_rag 74 | query_engine (object): llama_index query_engine object 75 | 76 | Returns: 77 | str: response to the query in string 78 | 79 | """ 80 | if not query: 81 | raise ValueError("The query must not be empty or None.") 82 | 83 | try: 84 | response = query_engine.query(query) 85 | print(response.response) 86 | return response 87 | except Exception as e: 88 | print(f"An error occurred while querying: {e}") 89 | raise 90 | -------------------------------------------------------------------------------- /graph_rag/graph_retrieval/README.MD: -------------------------------------------------------------------------------- 1 | # Graph Index Retriever 2 | 3 | This module provides methods for loading a graph index from a pickle file and querying it using a `llama_index` query engine. 4 | 5 | ## Usage 6 | 7 | ### Loading Graph Index from Pickle File 8 | 9 | 1. Load the graph index from a pickle file using `get_index_from_pickle(file_path)`. 10 | 2. By default, the file path is set to `"results/graphIndex.pkl"`. 11 | 12 | ```python 13 | from graph_rag.graph_retrieval.graph_retrieval import get_index_from_pickle 14 | 15 | index = get_index_from_pickle("path/to/your/graphIndex.pkl") 16 | ``` 17 | 18 | ### Setting Up the Query Engine 19 | 20 | 1. Initialize the LLM with `initialize_llm()`. 21 | 2. Create a query engine using `get_query_engine(index, with_embedding=False, similarity_top_k=5)`. 22 | - `index`: The loaded `llama_index` graph index object. 23 | - `with_embedding` (bool): Set to `True` to query the graph index with embeddings. Default is `False`. 24 | - `similarity_top_k` (int): Number of top similar chunks to provide as context to LLM for responding to the query. Default is `5`. 
25 | 26 | ```python 27 | from graph_rag.graph_retrieval.graph_retrieval import get_query_engine 28 | 29 | query_engine = get_query_engine(index, with_embedding=False, similarity_top_k=5) 30 | ``` 31 | 32 | ### Querying the Graph Index 33 | 34 | 1. Query the graph index using `graph_query(query, query_engine)`. 35 | - `query` (str): The query to be answered using `graph_rag`. 36 | - `query_engine`: The `llama_index` query engine object. 37 | 38 | ```python 39 | from graph_rag.graph_retrieval.graph_retrieval import graph_query 40 | 41 | response = graph_query("Your query here", query_engine) 42 | print(response) 43 | ``` 44 | ## Advanced Training with QLoRA and P-Tuning 45 | 46 | >Fine-tuning LLMs on your data (masked language modelling or next-token prediction) for a few epochs may result in better retrieval and responses. 47 | 48 | ### 1. Setup 49 | 50 | To use QLoRA and P-Tuning, ensure your environment is set up with the required libraries and that your model and dataset configurations are defined in a `config.yaml` file. 51 | 52 | ### 2. Fine-Tuning with QLoRA 53 | 54 | Use the QLoRA method for efficient fine-tuning by passing the appropriate configurations in your `config.yaml`. This method is ideal when working with large models on limited hardware. 55 | 56 | ```bash 57 | python qlora_adapter.py --config path/to/config.yaml 58 | ``` 59 | Execute the training script with the `--config` argument pointing to your configuration file. 60 | 61 | ### 3. Fine-Tuning with P-Tuning 62 | 63 | P-Tuning allows for parameter-efficient prompt-based fine-tuning. Adjust the number of virtual tokens and other related parameters in the `config.yaml` to customize the training process. 64 | 65 | ```bash 66 | python p_tuning.py --config path/to/config.yaml 67 | ``` 68 | Execute the training script with the `--config` argument pointing to your configuration file. 69 | 70 | 71 | 72 | 73 | 74 | 75 | This will start the training process using the specified method (QLoRA or P-Tuning) and configurations. 76 | 77 | 78 | -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/data_keras/index1.md: -------------------------------------------------------------------------------- 1 | # KerasTuner 2 | 3 | Star 4 | 5 | KerasTuner is an easy-to-use, scalable hyperparameter optimization framework 6 | that solves the pain points of hyperparameter search. Easily configure your 7 | search space with a define-by-run syntax, then leverage one of the available 8 | search algorithms to find the best hyperparameter values for your models. 9 | KerasTuner comes with Bayesian Optimization, Hyperband, and Random Search algorithms 10 | built-in, and is also designed to be easy for researchers to extend in order to 11 | experiment with new search algorithms. 12 | 13 | --- 14 | ## Quick links 15 | 16 | * [Getting started with KerasTuner](/guides/keras_tuner/getting_started/) 17 | * [KerasTuner developer guides](/guides/keras_tuner/) 18 | * [KerasTuner API reference](/api/keras_tuner/) 19 | * [KerasTuner on GitHub](https://github.com/keras-team/keras-tuner) 20 | 21 | 22 | --- 23 | ## Installation 24 | 25 | Install the latest release: 26 | 27 | ``` 28 | pip install keras-tuner --upgrade 29 | ``` 30 | 31 | You can also check out other versions in our 32 | [GitHub repository](https://github.com/keras-team/keras-tuner). 
33 | 34 | 35 | --- 36 | ## Quick introduction 37 | 38 | Import KerasTuner and TensorFlow: 39 | 40 | ```python 41 | import keras_tuner 42 | import keras 43 | ``` 44 | 45 | Write a function that creates and returns a Keras model. 46 | Use the `hp` argument to define the hyperparameters during model creation. 47 | 48 | ```python 49 | def build_model(hp): 50 | model = keras.Sequential() 51 | model.add(keras.layers.Dense( 52 | hp.Choice('units', [8, 16, 32]), 53 | activation='relu')) 54 | model.add(keras.layers.Dense(1, activation='relu')) 55 | model.compile(loss='mse') 56 | return model 57 | ``` 58 | 59 | Initialize a tuner (here, `RandomSearch`). 60 | We use `objective` to specify the objective to select the best models, 61 | and we use `max_trials` to specify the number of different models to try. 62 | 63 | ```python 64 | tuner = keras_tuner.RandomSearch( 65 | build_model, 66 | objective='val_loss', 67 | max_trials=5) 68 | ``` 69 | 70 | Start the search and get the best model: 71 | 72 | ```python 73 | tuner.search(x_train, y_train, epochs=5, validation_data=(x_val, y_val)) 74 | best_model = tuner.get_best_models()[0] 75 | ``` 76 | 77 | To learn more about KerasTuner, check out [this starter guide](https://keras.io/guides/keras_tuner/getting_started/). 78 | 79 | 80 | --- 81 | ## Citing KerasTuner 82 | 83 | If KerasTuner helps your research, we appreciate your citations. 84 | Here is the BibTeX entry: 85 | 86 | ```bibtex 87 | @misc{omalley2019kerastuner, 88 | title = {KerasTuner}, 89 | author = {O'Malley, Tom and Bursztein, Elie and Long, James and Chollet, Fran\c{c}ois and Jin, Haifeng and Invernizzi, Luca and others}, 90 | year = 2019, 91 | howpublished = {\url{https://github.com/keras-team/keras-tuner}} 92 | } 93 | ``` 94 | -------------------------------------------------------------------------------- /graph_rag/evaluation/ragas_evaluation/prompts.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains PROMPTS that are passed to llms to generate and critique Test-Dataset for Graph_Rag 3 | """ 4 | 5 | QA_generation_prompt = """ 6 | Your task is to write a factoid question and an answer given a context. 7 | Your factoid question should be answerable with a specific, concise piece of factual information from the context. 8 | Your factoid question should be formulated in the same style as questions users could ask in a search engine. 9 | This means that your factoid question MUST NOT mention something like "according to the passage" or "context". 10 | 11 | Provide your answer as follows: 12 | 13 | Output::: 14 | Factoid question: (your factoid question) 15 | Answer: (your answer to the factoid question) 16 | 17 | Now here is the context. 18 | 19 | Context: {context}\n 20 | Output:::""" 21 | 22 | question_groundedness_critique_prompt = """ 23 | You will be given a context and a question. 24 | Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context. 25 | Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context. 26 | 27 | Provide your answer as follows: 28 | 29 | Answer::: 30 | Evaluation: (your rationale for the rating, as a text) 31 | Total rating: (your rating, as a number between 1 and 5) 32 | 33 | You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer. 
34 | 35 | Now here are the question and context. 36 | 37 | Question: {question}\n 38 | Context: {context}\n 39 | Answer::: """ 40 | 41 | question_relevance_critique_prompt = """ 42 | You will be given a question. 43 | Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem. 44 | Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful. 45 | 46 | Provide your answer as follows: 47 | 48 | Answer::: 49 | Evaluation: (your rationale for the rating, as a text) 50 | Total rating: (your rating, as a number between 1 and 5) 51 | 52 | You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer. 53 | 54 | Now here is the question. 55 | 56 | Question: {question}\n 57 | Answer::: """ 58 | 59 | question_standalone_critique_prompt = """ 60 | You will be given a question. 61 | Your task is to provide a 'total rating' representing how context-independant this question is. 62 | Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself. 63 | For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1. 64 | The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about. 65 | 66 | For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context. 67 | 68 | Provide your answer as follows: 69 | 70 | Answer::: 71 | Evaluation: (your rationale for the rating, as a text) 72 | Total rating: (your rating, as a number between 1 and 5) 73 | 74 | You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer. 75 | 76 | Now here is the question. 77 | 78 | Question: {question}\n 79 | Answer::: """ 80 | -------------------------------------------------------------------------------- /static/logos/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
[drawio-exported SVG logo; embedded text: "Project Explainer". The SVG markup itself is not rendered in this text dump.]
-------------------------------------------------------------------------------- /graph_rag/graph_builder/tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains functions for initializing llm for 3 | 1. building KnowledgeGraph 4 | 2. loading documents from directory (also function for splitting code files) 5 | 3. converting llama-index Node to llama-index Documents 6 | """ 7 | 8 | from llama_index.llms.ollama import Ollama 9 | from llama_index.core import SimpleDirectoryReader 10 | from llama_index.core.node_parser import CodeSplitter 11 | from llama_index.core import Document 12 | from llama_index.core import Settings 13 | 14 | 15 | def initialize_llm( 16 | base_url: str = "http://localhost:11434", 17 | model: str = "llama3", 18 | chunk_size: int = 512, 19 | ): 20 | """ 21 | Initializes the llm for building the KnowledgeGraph 22 | Args: 23 | base_url: The ollama server URL where the model is listening 24 | model: The model string that ollama is hosting and will be used to build the KnowledgeGraph 25 | chunk_size: The Documents uploaded will be chunked, it represents size of each chunk 26 | 27 | Returns: 28 | None 29 | """ 30 | try: 31 | llm = Ollama(base_url=base_url, model=model) 32 | Settings.llm = llm 33 | Settings.chunk_size = chunk_size 34 | print(f"{model} initialized successfully!") 35 | except Exception as e: 36 | print(f"Error initializing LLM: {e}") 37 | 38 | 39 | def code_splitting(documents, language: str = "python"): 40 | """ 41 | If the KnowledgeGraph is to be built for code-files then files are split using this function 42 | Args: 43 | documents: llama-index Document type object, then coding-files Document 44 | language: The language of coding-file 45 | 46 | Returns: 47 | nodes: Split code chunks, llama-index Nodes type object 48 | """ 49 | try: 50 | splitter = CodeSplitter( 51 | language=language, 52 | chunk_lines=30, # lines per chunk 53 | chunk_lines_overlap=6, # lines overlap between chunks 54 | max_chars=1500, # max chars per chunk 55 | ) 56 | nodes = splitter.get_nodes_from_documents(documents) 57 | print(f"{len(nodes)} nodes created successfully!") 58 | return nodes 59 | except Exception as e: 60 | print(f"Error splitting code: {e}") 61 | return [] 62 | 63 | 64 | def convert_nodes_to_docs(nodes): 65 | """ 66 | Converts llama-index Nodes Type object to llama-index Document Type objects 67 | Args: 68 | nodes: llama-index Nodes type object 69 | Returns: 70 | llama-index Document Type objects 71 | """ 72 | try: 73 | documents_from_nodes = [ 74 | Document(text=node.text, metadata=node.metadata) for node in nodes 75 | ] 76 | print( 77 | f"{len(documents_from_nodes)} number of documents converted successfully!" 
78 | ) 79 | return documents_from_nodes 80 | except Exception as e: 81 | print(f"Error converting nodes to documents: {e}") 82 | return [] 83 | 84 | 85 | def load_directory( 86 | directory_path: str, code_file: bool = False, language: str = "python" 87 | ): 88 | """ 89 | Loads the documentation-directory, does preprocessing and chunking depending on code_file parameter 90 | Args: 91 | directory_path: Path to the Files Directory from which Knowledge graph is to be made 92 | code_file: Bool that specifies that given directory contains code files or not 93 | language: language of the code-files 94 | Returns: 95 | llama-index Document Type objects 96 | """ 97 | try: 98 | documents = SimpleDirectoryReader(directory_path).load_data() 99 | except Exception as e: 100 | print(f"Error loading directory: {e}") 101 | return [] 102 | 103 | try: 104 | if code_file: 105 | nodes = code_splitting(documents, language) 106 | docs = convert_nodes_to_docs(nodes) 107 | print(f"{len(docs)} documents loaded successfully!") 108 | return docs 109 | 110 | print(f"{len(documents)} documents loaded successfully!") 111 | return documents 112 | except Exception as e: 113 | print(f"Error processing documents: {e}") 114 | return [] 115 | -------------------------------------------------------------------------------- /graph_rag/evaluation/ragas_evaluation/evaluation_ragas.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script loads a pre-processed dataset, slices it for batch evaluation, and runs a series of metrics to evaluate the 3 | performance of a query engine using a language model and embeddings. 4 | 5 | Functions: 6 | - load_test_dataset: Loads a test dataset from a pickle file. 7 | - slice_data: Slices the dataset into batches for evaluation. 8 | - evaluate: Runs evaluation on the sliced dataset using specified metrics, LLMs, and embeddings. 9 | 10 | """ 11 | 12 | import pickle 13 | import pandas as pd 14 | from datasets import Dataset 15 | from ragas.integrations.llama_index import evaluate 16 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 17 | from ragas.metrics.critique import harmfulness 18 | from llama_index.llms.ollama import Ollama 19 | from ragas.metrics import ( 20 | faithfulness, 21 | answer_relevancy, 22 | context_precision, 23 | context_recall, 24 | ) 25 | 26 | 27 | def load_test_dataset( 28 | data: str, 29 | ): 30 | """ 31 | Loads a test dataset from a pickle file. 32 | 33 | Args: 34 | data: The path to the dataset file in pickle format. 35 | 36 | Returns: 37 | A dictionary representing the loaded dataset or an empty dictionary if loading fails due to EOFError. 38 | """ 39 | try: 40 | with open(data, "rb") as f: 41 | dataset = pickle.load(f) 42 | except EOFError: 43 | print("EOFError: The file may be corrupted or incomplete loading empty dictionary.") 44 | dataset = [] 45 | return dataset 46 | 47 | 48 | def slice_data(i: int, k: int, dataset: list): 49 | """ 50 | Slices the dataset into smaller chunks for batch processing. 51 | 52 | Args: 53 | i: The starting index for the slice. 54 | k: The size of the slice (number of records to include in each batch). 55 | dataset: The dictionary representing the dataset to be sliced. 56 | 57 | Returns: 58 | A dictionary containing the sliced dataset with renamed columns for consistency with the evaluation process. 
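Example (illustrative): slice_data(0, 4, dataset) returns a dict covering records 0-3, with the 'context' column renamed to 'contexts' and 'answer' renamed to 'ground_truth'.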
59 | """ 60 | 61 | hf_dataset = Dataset.from_list(dataset[i : i + k]) 62 | hf_dataset = hf_dataset.rename_column("context", "contexts") 63 | hf_dataset = hf_dataset.rename_column("answer", "ground_truth") 64 | ds_dict = hf_dataset.to_dict() 65 | return ds_dict 66 | 67 | 68 | def evaluate( 69 | query_engine: object, 70 | dataset: object, 71 | batch: int = 4, 72 | metrics: list = [ 73 | faithfulness, 74 | answer_relevancy, 75 | context_precision, 76 | context_recall, 77 | ], 78 | llm: object = Ollama(base_url="http://localhost:11434", model="codellama"), 79 | embeddings=HuggingFaceEmbedding(model_name="microsoft/codebert-base"), 80 | ): 81 | """ 82 | Evaluates the performance of a query engine on a dataset using various metrics and a language model. 83 | 84 | Args: 85 | query_engine: The query engine to be evaluated. 86 | dataset: The dataset to be evaluated against. 87 | batch: The number of records to process in each batch (default: 4). 88 | metrics: A list of metrics to be used for evaluation (default: faithfulness, answer relevancy, context precision, and context recall). 89 | llm: The language model to be used for evaluation (default: Ollama with model 'codellama'). 90 | embeddings: The embedding model to be used (default: HuggingFaceEmbedding with 'microsoft/codebert-base'). 91 | 92 | Returns: 93 | A pandas DataFrame containing the evaluation results for each batch. 94 | """ 95 | 96 | rows_count = len(next(iter(dataset.values()))) 97 | 98 | results_df = pd.DataFrame() 99 | 100 | for i in range(0, rows_count, batch): 101 | 102 | batch_data = slice_data(i, batch, dataset=dataset) 103 | 104 | result = evaluate( 105 | query_engine=query_engine, 106 | metrics=metrics, 107 | dataset=batch_data, 108 | llm=llm, 109 | embeddings=embeddings, 110 | ) 111 | 112 | rdf = result.to_pandas() 113 | results_df = pd.concat([results_df, rdf], ignore_index=True) 114 | print(f"Processed batch {i // batch + 1}:") 115 | print(rdf) 116 | print(results_df) 117 | results_df.to_csv("results.csv", index=False) 118 | return results_df 119 | -------------------------------------------------------------------------------- /graph_rag/evaluation/ragas_evaluation/QA_graphrag_testdataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script contains functions to generate question-answer pairs from input documents using a language model, 3 | and critique them based on various criteria like groundedness, relevance, and standalone quality. 4 | 5 | Functions: 6 | - get_response: Sends a request to a language model API to generate responses based on a provided prompt. 7 | - qa_generator: Generates a specified number of question-answer pairs from input documents. 8 | - critique_qa: Critiques the generated QA pairs based on groundedness, relevance, and standalone quality. 9 | """ 10 | 11 | from prompts import * 12 | import pandas as pd 13 | import random 14 | from tqdm.auto import tqdm 15 | import requests 16 | 17 | 18 | def get_response( 19 | prompt: str, url: str = "http://localhost:11434/api/generate", model: str = "llama3" 20 | ): 21 | """ 22 | Sends a prompt ollama API and retrieves the generated response. 23 | 24 | Args: 25 | prompt:The text input that the model will use to generate a response. 26 | url: The API endpoint for the model (default: "http://localhost:11434/api/generate"). 27 | model: The model to be used for generation (default: "llama3"). 28 | 29 | Returns: 30 | The generated response from the language model as a string. 
31 | """ 32 | 33 | payload = {"model": model, "prompt": prompt, "stream": False} 34 | response = requests.post(url, json=payload) 35 | resp = response.json() 36 | return resp["response"] 37 | 38 | 39 | def qa_generator( 40 | documents: object, 41 | N_GENERATIONS: int = 20, 42 | ): 43 | """ 44 | Generates a specified number of question-answer pairs from the provided documents. 45 | 46 | Args: 47 | documents: A collection of document objects to generate QA pairs from. 48 | N_GENERATIONS: The number of question-answer pairs to generate (default: 20). 49 | 50 | Returns: 51 | A list of dictionaries, each containing the generated context, question, answer, and source document metadata. 52 | """ 53 | print(f"Generating {N_GENERATIONS} QA couples...") 54 | 55 | outputs = [] 56 | for sampled_context in tqdm(random.sample(documents, N_GENERATIONS)): 57 | # Generate QA couple 58 | output_QA_couple = get_response( 59 | QA_generation_prompt.format(context=sampled_context.text) 60 | ) 61 | try: 62 | question = output_QA_couple.split("Factoid question: ")[-1].split( 63 | "Answer: " 64 | )[0] 65 | answer = output_QA_couple.split("Answer: ")[-1] 66 | assert len(answer) < 300, "Answer is too long" 67 | outputs.append( 68 | { 69 | "context": sampled_context.text, 70 | "question": question, 71 | "answer": answer, 72 | "source_doc": sampled_context.metadata, 73 | } 74 | ) 75 | except: 76 | continue 77 | df = pd.DataFrame(outputs) 78 | df.to_csv("QA.csv") 79 | return outputs 80 | 81 | 82 | def critique_qa( 83 | outputs: list, 84 | ): 85 | """ 86 | Critiques the generated question-answer pairs based on groundedness, relevance, and standalone quality. 87 | 88 | Args: 89 | outputs: A list of dictionaries containing generated QA pairs to be critiqued. 90 | 91 | Returns: 92 | The critiqued QA pairs with additional fields for groundedness, relevance, and standalone quality scores and evaluations. 
93 | """ 94 | print("Generating critique for each QA couple...") 95 | for output in tqdm(outputs): 96 | evaluations = { 97 | "groundedness": get_response( 98 | question_groundedness_critique_prompt.format( 99 | context=output["context"], question=output["question"] 100 | ), 101 | ), 102 | "relevance": get_response( 103 | question_relevance_critique_prompt.format(question=output["question"]), 104 | ), 105 | "standalone": get_response( 106 | question_standalone_critique_prompt.format(question=output["question"]), 107 | ), 108 | } 109 | try: 110 | for criterion, evaluation in evaluations.items(): 111 | score, eval = ( 112 | int(evaluation.split("Total rating: ")[-1].strip()), 113 | evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1], 114 | ) 115 | output.update( 116 | { 117 | f"{criterion}_score": score, 118 | f"{criterion}_eval": eval, 119 | } 120 | ) 121 | except Exception as e: 122 | continue 123 | generated_questions = pd.DataFrame.from_dict(outputs) 124 | generated_questions = generated_questions.loc[ 125 | (generated_questions["groundedness_score"] >= 4) 126 | & (generated_questions["relevance_score"] >= 4) 127 | & (generated_questions["standalone_score"] >= 4) 128 | ] 129 | generated_questions.to_csv("generated_questions.csv") 130 | return outputs 131 | -------------------------------------------------------------------------------- /project_explainer/examples/examples.py: -------------------------------------------------------------------------------- 1 | from gh_explainer import Explainer 2 | 3 | gptExplainer = Explainer("gpt2") 4 | 5 | print(gptExplainer.brief("https://github.com/c2siorg/Project-Explainer.git")) 6 | 7 | ## output 8 | 9 | # {'prompt': {'prompt': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.'}, 'prepared_prompt': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. 
Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.\nExplain the above : ', 'summary': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.\nExplain the above : \xa0The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. 
The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model.'} -------------------------------------------------------------------------------- /graph_rag/graph_builder/Example/build_with_relic.MD: -------------------------------------------------------------------------------- 1 | # Knowledge Graph with Relik and Llama-Index 2 | 3 | This markdown file demonstrates an experiment in building a knowledge graph using the `Relik` and `Llama-Index` Property Graphs. The steps include coreference resolution with `Spacy`, relation extraction with `Relik`, and knowledge graph construction with `llama-index PropertyGraphs`,stored in `neo4j`. 4 | 5 | ## Import Necessary Libraries 6 | 7 | Import the essential libraries required for the experiment. These include NLP tools (`Spacy`, `coreferee`), document readers, large language models (LLMs), embeddings, and Neo4j for graph storage. 8 | 9 | ```python 10 | import spacy, coreferee 11 | from llama_index.core import SimpleDirectoryReader 12 | import nest_asyncio 13 | from llama_index.llms.ollama import Ollama 14 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 15 | from llama_index.core import PropertyGraphIndex 16 | from llama_index.core import Settings 17 | from llama_index.extractors.relik.base import RelikPathExtractor 18 | from llama_index.graph_stores.neo4j import Neo4jPGStore 19 | ``` 20 | 21 | ## Coreference Resolution Function 22 | 23 | Sets up a function to resolve coreferences in a text. This is crucial for ensuring that the references to entities like "she" or "it" are correctly linked back to their antecedents,removing de-duplication of nodes from knowledge graph. 24 | 25 | ```python 26 | coref_nlp = spacy.load('en_core_web_lg') 27 | coref_nlp.add_pipe('coreferee') 28 | 29 | def coref_text(text): 30 | coref_doc = coref_nlp(text) 31 | resolved_text = "" 32 | 33 | for token in coref_doc: 34 | repres = coref_doc._.coref_chains.resolve(token) 35 | if repres: 36 | resolved_text += " " + " and ".join( 37 | [ 38 | t.text 39 | if t.ent_type_ == "" 40 | else [e.text for e in coref_doc.ents if t in e][0] 41 | for t in repres 42 | ] 43 | ) 44 | else: 45 | resolved_text += " " + token.text 46 | 47 | return resolved_text 48 | ``` 49 | 50 | ### Example Usage of Coreference Resolution 51 | 52 | An example is provided to demonstrate how the `coref_text` function resolves references in the text. 53 | 54 | ```python 55 | coref_text("alice is great. she can study for long hours and remember") 56 | # Output: alice is great. alice can study for long hours and remember 57 | ``` 58 | 59 | ## Load and Process Documents 60 | 61 | The documents are loaded from a specified directory and processed with the coreference resolution function to prepare them for knowledge graph construction. 62 | 63 | ```python 64 | documents = SimpleDirectoryReader(input_dir='/content/data').load_data() 65 | len(documents) 66 | 67 | for doc in documents: 68 | doc.text = coref_text(doc.text) 69 | ``` 70 | 71 | ## Initialize Relik Path Extractor 72 | 73 | Here, the `RelikPathExtractor` is initialized, which will be used to extract relationships between entities from the processed documents. 
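If the Relik integration for llama-index is not already present in your environment, it may need to be installed first. The package names below are an assumption inferred from the imports earlier in this walkthrough, not a pinned, verified setup:

```python
# Assumed one-time setup (run in a notebook cell); package names are inferred
# from the `llama_index.extractors.relik` import and may need adjusting.
!pip install llama-index-extractors-relik relik
```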
74 | 75 | ```python 76 | relik = RelikPathExtractor( 77 | model="relik-ie/relik-relation-extraction-small", model_config={"skip_metadata": True} 78 | ) 79 | ``` 80 | 81 | ## Set Up Language Model and Embeddings 82 | 83 | This section configures the LLM (`Ollama`) and the embedding model (`HuggingFaceEmbedding`) to be used for generating embeddings for the knowledge graph. 84 | 85 | ```python 86 | llm = Ollama(base_url="http://localhost:11434", model="llama3.1") 87 | embed_model = HuggingFaceEmbedding(model_name="microsoft/codebert-base") 88 | Settings.llm = llm 89 | ``` 90 | 91 | ## Configure Neo4j Graph Store 92 | 93 | Sets up the connection to a Neo4j database, where the knowledge graph will be stored. Ensure to replace the placeholder for the password with your actual Neo4j password. 94 | 95 | ```python 96 | username = "neo4j" 97 | password = "*****************************" 98 | url = "neo4j+s://45256b03.databases.neo4j.io" 99 | 100 | graph_store = Neo4jPGStore( 101 | username=username, 102 | password=password, 103 | url=url, 104 | refresh_schema=False 105 | ) 106 | ``` 107 | 108 | ## Build the Knowledge Graph 109 | 110 | Here, the knowledge graph is constructed from the processed documents using the configured tools: `Relik`, `Ollama`, `HuggingFaceEmbedding`, and `Neo4j`. 111 | 112 | ```python 113 | index = PropertyGraphIndex.from_documents( 114 | documents, 115 | kg_extractors=[relik], 116 | llm=llm, 117 | embed_model=embed_model, 118 | property_graph_store=graph_store, 119 | show_progress=True, 120 | ) 121 | ``` 122 | ![Alt text](random/visualisation.png) 123 | 124 | 125 | ## Query the Knowledge Graph 126 | 127 | Finally, a query engine is created, allowing you to query the knowledge graph. Example queries and their expected outputs are provided. 128 | 129 | ```python 130 | query_engine = index.as_query_engine(include_text=True) 131 | 132 | response = query_engine.query("what is keras nlp?") 133 | print(str(response)) 134 | 135 | # Output: Keras NLP provides a simple way to fine-tune pre-trained language models for various natural language processing tasks... 136 | ``` 137 | 138 | ```python 139 | response = query_engine.query("format for citing keras nlp") 140 | print(str(response)) 141 | 142 | # Output: To cite Keras NLP, you can refer to the following format: KerasNLP. (n.d.). Retrieved from ... 143 | ``` 144 | -------------------------------------------------------------------------------- /graph_rag/experiments/artifacts/data_keras/index2.md: -------------------------------------------------------------------------------- 1 | # KerasNLP 2 | 3 | Star 4 | 5 | KerasNLP is a natural language processing library that works natively 6 | with TensorFlow, JAX, or PyTorch. Built on Keras 3, these models, layers, 7 | metrics, and tokenizers can be trained and serialized in any framework and 8 | re-used in another without costly migrations. 9 | 10 | KerasNLP supports users through their entire development cycle. Our workflows 11 | are built from modular components that have state-of-the-art preset weights when 12 | used out-of-the-box and are easily customizable when more control is needed. 13 | 14 | This library is an extension of the core Keras API; all high-level modules are 15 | [`Layers`](/api/layers/) or 16 | [`Models`](/api/models/) that receive that same level of polish 17 | as core Keras. If you are familiar with Keras, congratulations! You already 18 | understand most of KerasNLP. 
19 | 20 | See our [Getting Started guide](/guides/keras_nlp/getting_started) 21 | to start learning our API. We welcome 22 | [contributions](https://github.com/keras-team/keras-nlp/blob/master/CONTRIBUTING.md). 23 | 24 | --- 25 | ## Quick links 26 | 27 | * [KerasNLP API reference](/api/keras_nlp/) 28 | * [KerasNLP on GitHub](https://github.com/keras-team/keras-nlp) 29 | * [List of available pre-trained models](/api/keras_nlp/models/) 30 | 31 | ## Guides 32 | 33 | * [Getting Started with KerasNLP](/guides/keras_nlp/getting_started/) 34 | * [Uploading Models with KerasNLP](/guides/keras_nlp/upload/) 35 | * [Pretraining a Transformer from scratch](/guides/keras_nlp/transformer_pretraining/) 36 | 37 | ## Examples 38 | 39 | * [GPT-2 text generation](/examples/generative/gpt2_text_generation_with_kerasnlp/) 40 | * [Parameter-efficient fine-tuning of GPT-2 with LoRA](/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora/) 41 | * [Semantic Similarity](/examples/nlp/semantic_similarity_with_keras_nlp/) 42 | * [Sentence embeddings using Siamese RoBERTa-networks](/examples/nlp/sentence_embeddings_with_sbert/) 43 | * [Data Parallel Training with tf.distribute](/examples/nlp/data_parallel_training_with_keras_nlp/) 44 | * [English-to-Spanish translation](/examples/nlp/neural_machine_translation_with_keras_nlp/) 45 | * [GPT text generation from scratch](/examples/generative/text_generation_gpt/) 46 | * [Text Classification using FNet](/examples/nlp/fnet_classification_with_keras_nlp/) 47 | 48 | --- 49 | ## Installation 50 | 51 | KerasNLP supports both Keras 2 and Keras 3. We recommend Keras 3 for all new 52 | users, as it enables using KerasNLP models and layers with JAX, TensorFlow and 53 | PyTorch. 54 | 55 | ### Keras 2 Installation 56 | 57 | To install the latest KerasNLP release with Keras 2, simply run: 58 | 59 | ``` 60 | pip install --upgrade keras-nlp 61 | ``` 62 | 63 | ### Keras 3 Installation 64 | 65 | There are currently two ways to install Keras 3 with KerasNLP. To install the 66 | stable versions of KerasNLP and Keras 3, you should install Keras 3 **after** 67 | installing KerasNLP. This is a temporary step while TensorFlow is pinned to 68 | Keras 2, and will no longer be necessary after TensorFlow 2.16. 69 | 70 | ``` 71 | pip install --upgrade keras-nlp 72 | pip install --upgrade keras 73 | ``` 74 | 75 | To install the latest nightly changes for both KerasNLP and Keras, you can use 76 | our nightly package. 77 | 78 | ``` 79 | pip install --upgrade keras-nlp-nightly 80 | ``` 81 | 82 | **Note:** Keras 3 will not function with TensorFlow 2.14 or earlier. 83 | 84 | See [Getting started with Keras](/getting_started/) for more information on 85 | installing Keras generally and compatibility with different frameworks. 86 | 87 | --- 88 | ## Quickstart 89 | 90 | Fine-tune BERT on a small sentiment analysis task using the 91 | [`keras_nlp.models`](/api/keras_nlp/models/) API: 92 | 93 | ```python 94 | import os 95 | os.environ["KERAS_BACKEND"] = "tensorflow" # Or "jax" or "torch"! 96 | 97 | import keras_nlp 98 | import tensorflow_datasets as tfds 99 | 100 | imdb_train, imdb_test = tfds.load( 101 | "imdb_reviews", 102 | split=["train", "test"], 103 | as_supervised=True, 104 | batch_size=16, 105 | ) 106 | # Load a BERT model. 107 | classifier = keras_nlp.models.BertClassifier.from_preset( 108 | "bert_base_en_uncased", 109 | num_classes=2, 110 | ) 111 | # Fine-tune on IMDb movie reviews. 112 | classifier.fit(imdb_train, validation_data=imdb_test) 113 | # Predict two new examples. 
114 | classifier.predict(["What an amazing movie!", "A total waste of my time."]) 115 | ``` 116 | 117 | --- 118 | ## Compatibility 119 | 120 | We follow [Semantic Versioning](https://semver.org/), and plan to 121 | provide backwards compatibility guarantees both for code and saved models built 122 | with our components. While we continue with pre-release `0.y.z` development, we 123 | may break compatibility at any time and APIs should not be consider stable. 124 | 125 | ## Disclaimer 126 | 127 | KerasNLP provides access to pre-trained models via the `keras_nlp.models` API. 128 | These pre-trained models are provided on an "as is" basis, without warranties 129 | or conditions of any kind. The following underlying models are provided by third 130 | parties, and subject to separate licenses: 131 | BART, DeBERTa, DistilBERT, GPT-2, OPT, RoBERTa, Whisper, and XLM-RoBERTa. 132 | 133 | ## Citing KerasNLP 134 | 135 | If KerasNLP helps your research, we appreciate your citations. 136 | Here is the BibTeX entry: 137 | 138 | ```bibtex 139 | @misc{kerasnlp2022, 140 | title={KerasNLP}, 141 | author={Watson, Matthew, and Qian, Chen, and Bischof, Jonathan and Chollet, 142 | Fran\c{c}ois and others}, 143 | year={2022}, 144 | howpublished={\url{https://github.com/keras-team/keras-nlp}}, 145 | } 146 | ``` 147 | -------------------------------------------------------------------------------- /project_explainer/gh_explainer/summarize.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM 3 | from gh_processor import (download_github_repo, 4 | extract_project_description_from_readme, 5 | extract_headings_with_paragraphs_from_markdown, 6 | remove_tables_from_markdown, 7 | remove_code_blocks_from_markdown, 8 | remove_images_from_markdown, 9 | remove_links_from_markdown) 10 | import os 11 | from jinja2 import Template 12 | 13 | 14 | class Explainer(): 15 | def __init__(self, base_model_id: str, device: str = "cpu") -> None: 16 | """ 17 | Initializes the Explainer object. 18 | 19 | Args: 20 | base_model_id: The ID or path to the base model. 21 | device: The device to use for model inference (default is "cpu"). 22 | 23 | Raises: 24 | ValueError: If the provided base model ID or path is invalid. 25 | """ 26 | self.base_model_id = base_model_id 27 | self.device = device 28 | self.tokenizer=AutoTokenizer.from_pretrained(base_model_id) 29 | try: 30 | # support decoder only models 31 | if self.device == "cuda": 32 | self.model=AutoModelForCausalLM.from_pretrained(base_model_id, return_dict=True).to("cuda") 33 | else: 34 | self.model=AutoModelForCausalLM.from_pretrained(base_model_id, return_dict=True) 35 | self.brief_prompt_template = "{{ prompt }}\nExplain the above : " 36 | except Exception as e: 37 | # support encoder decoder models 38 | try: 39 | if self.device == "cuda": 40 | self.model=AutoModelForSeq2SeqLM.from_pretrained(base_model_id, return_dict=True).to("cuda") 41 | else: 42 | self.model=AutoModelForSeq2SeqLM.from_pretrained(base_model_id, return_dict=True) 43 | self.brief_prompt_template = "summarize: {{ prompt }}" 44 | except Exception as e2: 45 | raise ValueError(str(e), str(e2)) 46 | 47 | def _fill_template(self, template_string: str, variables: dict) -> str: 48 | """ 49 | Fills in variables in a template string using the provided dictionary and returns the filled template. 50 | 51 | Args: 52 | template_string: The template string with variables to be filled. 
53 | variables: A dictionary containing the variable names and their corresponding values. 54 | 55 | Returns: 56 | The filled template string. 57 | 58 | Raises: 59 | TypeError: If the template_string is not a string or variables is not a dictionary. 60 | """ 61 | template = Template(template_string) 62 | filled_template = template.render(variables) 63 | return filled_template 64 | 65 | def _model_gen(self, prompt: str) -> str: 66 | """ 67 | Generates a response using a hugging face transformer model based on the provided prompt. 68 | 69 | Args: 70 | prompt: The input prompt for generating the response. 71 | 72 | Returns: 73 | The generated response as a string. 74 | 75 | Raises: 76 | TypeError: If the prompt is not a string. 77 | """ 78 | inputs=self.tokenizer.encode(prompt, return_tensors='pt', max_length=1024, truncation=True) 79 | output = self.model.generate(inputs, min_length=256, max_length=512) 80 | return self.tokenizer.decode(output[0], skip_special_tokens=True) 81 | 82 | def brief(self, github_url: str, branch: str = "main") -> dict: 83 | """ 84 | Generates a brief summary of a project based on its README file. 85 | 86 | Args: 87 | github_url: The URL of the GitHub repository. 88 | branch: The branch name to download (default is "main"). 89 | 90 | Returns: 91 | A dictionary containing the original prompt, prepared prompt, and the generated summary. 92 | 93 | Raises: 94 | ValueError: If the README.md file is not found. 95 | """ 96 | repo_path = download_github_repo(github_url, branch) 97 | readme_path = os.path.join(repo_path, "README.md") 98 | if not os.path.exists(readme_path): 99 | raise ValueError("README.md not found") 100 | project_description = extract_project_description_from_readme(readme_path) 101 | prompt = {"prompt": project_description} 102 | prepared_prompt = self._fill_template(self.brief_prompt_template, prompt) 103 | summary=self._model_gen(prepared_prompt) 104 | return {"prompt": prompt, "prepared_prompt": prepared_prompt, "summary": str(summary)} 105 | 106 | def outline(self, github_url: str, branch: str = "main") -> dict: 107 | """ 108 | Generates an outline of a project based on its README file. 109 | 110 | Args: 111 | github_url: The URL of the GitHub repository. 112 | branch: The branch name to download (default is "main"). 113 | 114 | Returns: 115 | A dictionary containing the outline with headings as keys and generated summaries as values. 116 | 117 | Raises: 118 | ValueError: If the README.md file is not found. 
119 | """ 120 | repo_path = download_github_repo(github_url, branch) 121 | readme_path = os.path.join(repo_path, "README.md") 122 | if not os.path.exists(readme_path): 123 | raise ValueError("README.md not found") 124 | headings_and_paras = extract_headings_with_paragraphs_from_markdown(readme_path) 125 | outline_dict = {} 126 | for key, value in headings_and_paras.items(): 127 | content = remove_code_blocks_from_markdown(remove_images_from_markdown(remove_links_from_markdown(remove_tables_from_markdown(value)))) 128 | prompt = {"prompt": content} 129 | prepared_prompt = self._fill_template(self.brief_prompt_template, prompt) 130 | outline_dict[key] = self._model_gen(prepared_prompt) 131 | return outline_dict 132 | -------------------------------------------------------------------------------- /graph_rag/evaluation/random/results_5.csv: -------------------------------------------------------------------------------- 1 | question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_precision,context_recall 2 | "What is mixed precision in computing? 3 | ","['Examples\n\n* GPT-2 text generation\n* Parameter-efficient fine-tuning of GPT-2 with LoRA\n* Semantic Similarity\n* Sentence embeddings using Siamese RoBERTa-networks\n* Data Parallel Training with tf.distribute\n* English-to-Spanish translation\n* GPT text generation from scratch\n* Text Classification using FNet\n\n---' 4 | ""Supported hardware\n\nWhile mixed precision will run on most hardware, it will only speed up models on recent NVIDIA GPUs and Google TPUs.\nNVIDIA GPUs support using a mix of float16 and float32, while TPUs support a mix of bfloat16 and float32.\n\nAmong NVIDIA GPUs, those with compute capability 7.0 or higher will see the greatest performance benefit\nfrom mixed precision because they have special hardware units, called Tensor Cores,\nto accelerate float16 matrix multiplications and convolutions. Older GPUs offer no math\nperformance benefit for using mixed precision, however memory and bandwidth savings can enable some speedups.\nYou can look up the compute capability for your GPU at NVIDIA's CUDA GPU web page.\nExamples of GPUs that will benefit most from mixed precision include RTX GPUs, the V100, and the A100.\n\nEven on CPUs and older GPUs, where no speedup is expected, mixed precision APIs can still be used for unit testing,\ndebugging, or just to try out the API. On CPUs, mixed precision will run significantly slower, however.\n\n\nYou can check your GPU type with the following command:\n\n```\nnvidia-smi -L\n```"" 5 | ""The following are knowledge sequence in max depth 2 in the form of directed graph like:\n`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`\n['Mixed precision', 'Will run on most hardware', 'But will only speed up models on recent nvidia gpus and google tpus']""]","Mixed precision is a technique used to improve the efficiency of computer computations by using lower-precision data types (e.g., float16 or bfloat16) for some parts of the computation while still maintaining accuracy. It is commonly used in deep learning and other numerical compute-intensive applications, where memory bandwidth and computational resources are limited. 6 | 7 | While mixed precision will run on most hardware, it will only speed up models on recent NVIDIA GPUs and Google TPUs. Older GPUs offer no math performance benefit for using mixed precision, however memory and bandwidth savings can enable some speedups. 
You can look up the compute capability for your GPU at NVIDIA's CUDA GPU web page to determine whether it will benefit from mixed precision. Examples of GPUs that will benefit most from mixed precision include RTX GPUs, the V100, and the A100.",A combination of different numerical precisions used to optimize memory usage and computation speed.,0.16666666666666666,0.9818589787934606,0.0,0.6666666666666666 8 | "What is the title of the guide discussed in the given context? 9 | ","['Available guides\n\n{{toc}}' 10 | 'Hyperparameter Tuning\n\nThese guides cover KerasTuner best practices.']","The title of the guide discussed in the given context is ""Hyperparameter Tuning""",How to distribute training,0.0,0.9999999999999996,0.0,1.0 11 | "What is Keras 3? 12 | ",['No relationships found.'],"Keras 3 is a new version of the popular deep learning framework Keras that was released in October 2020. It introduces several new features and improvements over the previous versions, including support for more complex architectures, improved performance on certain tasks, and easier integration with other libraries and frameworks.","A deep learning framework that works with TensorFlow, JAX, and PyTorch interchangeably.",0.0,0.9747114512069875,0.0,0.5 13 | "What was the percentage boost in StableDiffusion training throughput with Keras 3? 14 | ","['A first example: A MNIST convnet\n\nLet\'s start with the Hello World of ML: training a convnet\nto classify MNIST digits.\n\nHere\'s the data:\n\n\n```python\n# Load the data and split it between train and test sets\n(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n\n# Scale images to the [0, 1] range\nx_train = x_train.astype(""float32"") / 255\nx_test = x_test.astype(""float32"") / 255\n# Make sure images have shape (28, 28, 1)\nx_train = np.expand_dims(x_train, -1)\nx_test = np.expand_dims(x_test, -1)\nprint(""x_train shape:"", x_train.shape)\nprint(""y_train shape:"", y_train.shape)\nprint(x_train.shape[0], ""train samples"")\nprint(x_test.shape[0], ""test samples"")\n```\n\n\n```\nx_train shape: (60000, 28, 28, 1)\ny_train shape: (60000,)\n60000 train samples\n10000 test samples\n\n```\n\nHere\'s our model.\n\nDifferent model-building options that Keras offers include:\n\n- The Sequential API (what we use below)\n- The Functional API (most typical)\n- Writing your own models yourself via subclassing (for advanced use cases)\n\n\n```python\n# Model parameters\nnum_classes = 10\ninput_shape = (28, 28, 1)\n\nmodel = keras.Sequential(\n [\n keras.layers.Input(shape=input_shape),\n keras.layers.Conv2D(64, kernel_size=(3, 3), activation=""relu""),\n keras.layers.Conv2D(64, kernel_size=(3, 3), activation=""relu""),\n keras.layers.MaxPooling2D(pool_size=(2, 2)),\n keras.layers.Conv2D(128, kernel_size=(3, 3), activation=""relu""),\n keras.layers.Conv2D(128, kernel_size=(3, 3), activation=""relu""),\n keras.layers.GlobalAveragePooling2D(),\n keras.layers.Dropout(0.5),\n keras.layers.Dense(num_classes, activation=""softmax""),\n ]\n)\n```\n\nHere\'s our model summary:\n\n\n```python\nmodel.summary()\n```\n\n\nModel: ""sequential""']","The percentage boost in StableDiffusion training throughput with Keras 3 is not mentioned in the provided context information. Therefore, I cannot answer this question.",Over 150%,0.0,0.9705647972395562,0.9999999999,1.0 15 | "What are some examples of pretrained models available in KerasCV and KerasNLP? 
16 | ",['No relationships found.'],"Some examples of pre-trained models available in KerasCV include the InceptionV3 model, the ResNet50 model, and the VGG16 model. Some examples of pre-trained models available in KerasNLP include the BERT model, the RoBERTa model, and the DistilBERT model. These models can be used for a variety of natural language processing tasks such as text classification, sentiment analysis, named entity recognition, and question answering.","BERT, OPT, Whisper, T5, StableDiffusion, YOLOv8, SegmentAnything, etc.",0.0,0.9897694771234743,0.0,0.8571428571428571 17 | -------------------------------------------------------------------------------- /graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script prepares data from a repository for training a P-tuning model using the PEFT library. 3 | It reads source files, processes them into tokenized chunks, and trains a language model using the specified configuration. 4 | 5 | Functions: 6 | - prepare_data: Collects files from a repository, concatenates their content, and saves it to an output file. 7 | - data_for_training: Tokenizes the concatenated content and prepares it for language model training. 8 | - get_peft_model: Initializes and configures a P-tuning model using the specified configuration. 9 | - create_training_arguments: Generates training arguments for the Trainer using the configuration settings. 10 | - create_trainer: Creates a Trainer object with the model, data, and training arguments. 11 | - main: Parses the YAML configuration file and runs the training process. 12 | 13 | Requirements: 14 | - A YAML configuration file that specifies model, training, and data parameters. 15 | """ 16 | 17 | import argparse 18 | import yaml 19 | import os 20 | import glob 21 | from datasets import Dataset 22 | from transformers import Trainer, DataCollatorForLanguageModeling 23 | from transformers import AutoModelForCausalLM, AutoTokenizer 24 | from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit 25 | from transformers import TrainingArguments 26 | 27 | 28 | def prepare_data(repo_path: str, extensions: list, output_file: str): 29 | """ 30 | Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file. 31 | 32 | Args: 33 | repo_path: Path to the repository to collect files from. 34 | extensions: List of file extensions to include in the data preparation. 35 | output_file: Path to the output file where the concatenated content will be saved. 36 | 37 | Returns: 38 | A string containing the entire content written to the output file. 39 | """ 40 | 41 | files = [] 42 | for ext in extensions: 43 | files.extend( 44 | glob.glob(os.path.join(repo_path, "**", f"*.{ext}"), recursive=True) 45 | ) 46 | 47 | with open(output_file, "w", encoding="utf-8") as outfile: 48 | for path in files: 49 | with open(path, "r", encoding="utf-8") as file: 50 | content = file.read() 51 | outfile.write(f"### {path} ###\n") 52 | outfile.write(content) 53 | outfile.write("\n\n") 54 | 55 | with open(output_file, "r") as f: 56 | return f.read() 57 | 58 | 59 | def data_for_training(content: str, config: dict): 60 | """ 61 | Tokenizes the content and prepares it for language model training, including creating a data collator. 62 | 63 | Args: 64 | content: The concatenated text content to be tokenized. 
65 | config: Dictionary containing the model and training configuration. 66 | 67 | Returns: 68 | A tuple containing the tokenized dataset and the data collator for language model training. 69 | """ 70 | 71 | tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"]) 72 | context_length = config["Model"]["context_length"] 73 | outputs = tokenizer( 74 | content, 75 | truncation=True, 76 | max_length=context_length, 77 | return_overflowing_tokens=True, 78 | return_length=True, 79 | ) 80 | print(f"Input IDs length: {len(outputs['input_ids'])}") 81 | print(f"Input chunk lengths: {outputs['length']}") 82 | print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}") 83 | ds = Dataset.from_dict(outputs) 84 | ds_removed = ds.remove_columns( 85 | ["attention_mask", "length", "overflow_to_sample_mapping"] 86 | ) 87 | tokenizer.pad_token = tokenizer.eos_token 88 | data_collator = DataCollatorForLanguageModeling( 89 | tokenizer, mlm=config["Training"]["masked_language_modelling"] 90 | ) 91 | return ds_removed, data_collator 92 | 93 | 94 | def get_peft_model(config: dict): 95 | """ 96 | Initializes and configures a P-tuning model using the specified foundational model and prompt tuning configuration. 97 | 98 | Args: 99 | config: Dictionary containing the model and training configuration. 100 | 101 | Returns: 102 | A P-tuned model ready for training. 103 | """ 104 | from peft import get_peft_model as peft_get_peft_model  # local alias: this wrapper shadows peft's helper of the same name 105 | foundational_model = AutoModelForCausalLM.from_pretrained( 106 | config["Model"]["model"], trust_remote_code=True 107 | ) 108 | generation_config = PromptTuningConfig( 109 | task_type=TaskType.CAUSAL_LM, 110 | prompt_tuning_init=PromptTuningInit.RANDOM, 111 | num_virtual_tokens=config["Training"]["num_virtual_tokens"], 112 | tokenizer_name_or_path=config["Model"]["model"], 113 | ) 114 | peft_model_prompt = peft_get_peft_model(foundational_model, generation_config) 115 | peft_model_prompt.print_trainable_parameters() 116 | return peft_model_prompt 117 | 118 | 119 | def create_training_arguments(config: dict): 120 | """ 121 | Creates and configures the training arguments for the Trainer object. 122 | 123 | Args: 124 | config: Dictionary containing the training configuration. 125 | 126 | Returns: 127 | A TrainingArguments object with the specified settings. 128 | """ 129 | 130 | training_args = TrainingArguments( 131 | output_dir=config["Training"]["output_dir"], 132 | save_strategy="steps", 133 | per_device_train_batch_size=config["Training"]["batch_size"], 134 | auto_find_batch_size=config["Training"]["auto_batch_size"], 135 | learning_rate=config["Training"]["learning_rate"], 136 | num_train_epochs=config["Training"]["num_epochs"], 137 | push_to_hub=config["Training"]["push_to_hub"], 138 | ) 139 | return training_args 140 | 141 | 142 | def create_trainer( 143 | config: dict, train_data: object, data_collator: object, model: object 144 | ): 145 | """ 146 | Creates a Trainer object for training the model with the provided data and configuration. 147 | 148 | Args: 149 | config: Dictionary containing the training configuration. 150 | train_data: The tokenized dataset (a Hugging Face Dataset object) to be used for training. 151 | data_collator: The data collator for handling the tokenized data during training. 152 | model: The P-tuned model to be trained. 153 | 154 | Returns: 155 | A Trainer object configured for training the model.
156 | """ 157 | 158 | training_args = create_training_arguments(config) 159 | trainer = Trainer( 160 | model=model, 161 | args=training_args, 162 | train_dataset=train_data, 163 | data_collator=data_collator, 164 | ) 165 | return trainer 166 | 167 | 168 | def main(): 169 | """ 170 | Main function to execute the training pipeline. It parses the YAML configuration file, prepares the data, initializes 171 | the model, and starts the training process. 172 | """ 173 | parser = argparse.ArgumentParser(description="Training script for P-tuning model") 174 | parser.add_argument( 175 | "--config", type=str, required=True, help="Path to the YAML configuration file" 176 | ) 177 | args = parser.parse_args() 178 | 179 | with open(args.config, "r") as file: 180 | config = yaml.safe_load(file) 181 | 182 | content = prepare_data( 183 | config["Data"]["repo_path"], 184 | config["Data"]["extensions"], 185 | config["Data"]["output_file"], 186 | ) 187 | 188 | train_data, data_collator = data_for_training(content, config) 189 | model = get_peft_model(config) 190 | trainer = create_trainer(config, train_data, data_collator, model) 191 | 192 | trainer.train() 193 | 194 | 195 | if __name__ == "__main__": 196 | main() 197 | -------------------------------------------------------------------------------- /graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script facilitates the fine-tuning of a language model using QLoRA (Quantized Low-Rank Adapter) 3 | adapter tuning. 4 | 5 | The main functionalities include: 6 | - Preparing data from a specified repository with specific file extensions. 7 | - Tokenizing the data for model training. 8 | - Loading and configuring a pre-trained language model. 9 | - Applying PEFT (Parameter-Efficient Fine-Tuning) using QLoRA. 10 | - Defining training arguments and creating a Trainer instance. 11 | - Executing the training process with the Trainer. 12 | 13 | Requirements: 14 | - A YAML configuration file that specifies model, training, and data parameters. 15 | """ 16 | 17 | import argparse 18 | import yaml 19 | import os 20 | import glob 21 | import torch 22 | from datasets import Dataset 23 | from transformers import Trainer, DataCollatorForLanguageModeling 24 | from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training 25 | from transformers import ( 26 | AutoModelForCausalLM, 27 | AutoTokenizer, 28 | Trainer, 29 | TrainingArguments, 30 | BitsAndBytesConfig, 31 | ) 32 | 33 | 34 | def prepare_data(repo_path: str, extensions: list, output_file: str): 35 | """ 36 | Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file. 37 | 38 | Args: 39 | repo_path: Path to the repository to collect files from. 40 | extensions: List of file extensions to include in the data preparation. 41 | output_file: Path to the output file where the concatenated content will be saved. 42 | 43 | Returns: 44 | A string containing the entire content written to the output file. 
45 | """ 46 | 47 | files = [] 48 | for ext in extensions: 49 | files.extend( 50 | glob.glob(os.path.join(repo_path, "**", f"*.{ext}"), recursive=True) 51 | ) 52 | 53 | with open(output_file, "w", encoding="utf-8") as outfile: 54 | for path in files: 55 | with open(path, "r", encoding="utf-8") as file: 56 | content = file.read() 57 | outfile.write(f"### {path} ###\n") 58 | outfile.write(content) 59 | outfile.write("\n\n") 60 | 61 | with open(output_file, "r") as f: 62 | return f.read() 63 | 64 | 65 | def data_for_training(content: str, config: dict): 66 | """ 67 | Tokenizes the content and prepares it for language model training, including creating a data collator. 68 | 69 | Args: 70 | content: The concatenated text content to be tokenized. 71 | config: Dictionary containing the model and training configuration. 72 | 73 | Returns: 74 | A tuple containing the tokenized dataset,tokenizer,data collator for language model training. 75 | """ 76 | tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"]) 77 | context_length = config["Model"]["context_length"] 78 | outputs = tokenizer( 79 | content, 80 | truncation=True, 81 | max_length=context_length, 82 | return_overflowing_tokens=True, 83 | return_length=True, 84 | ) 85 | print(f"Input IDs length: {len(outputs['input_ids'])}") 86 | print(f"Input chunk lengths: {outputs['length']}") 87 | print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}") 88 | ds = Dataset.from_dict(outputs) 89 | ds_removed = ds.remove_columns( 90 | ["attention_mask", "length", "overflow_to_sample_mapping"] 91 | ) 92 | tokenizer.pad_token = tokenizer.eos_token 93 | data_collator = DataCollatorForLanguageModeling( 94 | tokenizer, mlm=config["Training"]["masked_language_modelling"] 95 | ) 96 | return ds_removed, data_collator, tokenizer 97 | 98 | 99 | def load_base_model(config: dict): 100 | """ 101 | Loads the base language model with specified configurations, including quantization settings. 102 | 103 | Args: 104 | config: The configuration dictionary containing model and BNB (BitsAndBytes) parameters. 105 | 106 | Returns: 107 | PreTrainedModel: The loaded pre-trained language model ready for training. 108 | """ 109 | 110 | compute_dtype = getattr(torch, config["BNB_CONFIG"]["BNB_4BIT_COMPUTE_DTYPE"]) 111 | 112 | bnb_config = BitsAndBytesConfig( 113 | load_in_4bit=True, 114 | bnb_4bit_quant_type="nf4", 115 | bnb_4bit_compute_dtype=compute_dtype, 116 | bnb_4bit_use_double_quant=config["BNB_CONFIG"]["USE_NESTED_QUANT"], 117 | ) 118 | device_map = {"": 0} 119 | 120 | model = AutoModelForCausalLM.from_pretrained( 121 | config["MODEL"]["MODEL"], 122 | load_in_8bit=config["MODEL"]["LOAD_IN_8BIT"], 123 | quantization_config=bnb_config, 124 | device_map=device_map, 125 | use_cache=False, 126 | trust_remote_code=True, 127 | ) 128 | return model 129 | 130 | 131 | def load_peft_model(model: object, config: dict): 132 | """ 133 | Applies PEFT (Parameter-Efficient Fine-Tuning) using QLoRA to the given model. 134 | 135 | Args: 136 | model: The pre-trained language model to be fine-tuned. 137 | config: The configuration dictionary containing LORA (Low-Rank Adapter) parameters. 138 | 139 | Returns: 140 | PreTrainedModel: The PEFT-configured model ready for training. 
141 | """ 142 | 143 | model = prepare_model_for_kbit_training(model) 144 | peft_config = LoraConfig( 145 | lora_alpha=config["LORA"]["LORA_ALPHA"], 146 | lora_dropout=config["LORA"]["LORA_DROPOUT"], 147 | r=config["LORA"]["LORA_R"], 148 | bias="none", 149 | task_type="CAUSAL_LM", 150 | # target_modules=, 151 | ) 152 | model = get_peft_model(model, peft_config) 153 | model.print_trainable_parameters() 154 | return model 155 | 156 | 157 | def create_training_arguments(config: dict): 158 | """ 159 | Creates and returns the training arguments for the Trainer. 160 | 161 | Args: 162 | config: The configuration dictionary containing training arguments. 163 | 164 | Returns: 165 | TrainingArguments: The configured training arguments. 166 | """ 167 | 168 | training_args = TrainingArguments( 169 | output_dir=f"results/{config['TRAINING_ARGUMENTS']['OUTPUT_DIR']}", 170 | num_train_epochs=3, 171 | dataloader_drop_last=True, 172 | evaluation_strategy="steps", 173 | save_strategy="steps", 174 | eval_steps=config["TRAINING_ARGUMENTS"]["EVAL_FREQ"], 175 | save_steps=config["TRAINING_ARGUMENTS"]["SAVE_FREQ"], 176 | logging_steps=config["TRAINING_ARGUMENTS"]["LOG_FREQ"], 177 | per_device_train_batch_size=64, 178 | per_device_eval_batch_size=64, 179 | learning_rate=config["TRAINING_ARGUMENTS"]["LR"], 180 | lr_scheduler_type=config["TRAINING_ARGUMENTS"]["LR_SCHEDULER_TYPE"], 181 | warmup_steps=config["TRAINING_ARGUMENTS"]["NUM_WARMUP_STEPS"], 182 | gradient_accumulation_steps=config["TRAINING_ARGUMENTS"]["GR_ACC_STEPS"], 183 | gradient_checkpointing=True, 184 | fp16=config["TRAINING_ARGUMENTS"]["FP16"], 185 | bf16=config["TRAINING_ARGUMENTS"]["BF16"], 186 | weight_decay=config["TRAINING_ARGUMENTS"]["WEIGHT_DECAY"], 187 | # push_to_hub=True, 188 | include_tokens_per_second=True, 189 | ) 190 | return training_args 191 | 192 | 193 | def create_trainer( 194 | tokenizer: object, train_data: object, data_collator: object, model: object 195 | ): 196 | """ 197 | Creates a Trainer instance with the provided tokenizer, training data, data collator, and model. 198 | 199 | Args: 200 | tokenizer: The tokenizer to be used during training. 201 | train_data : The tokenized training dataset. 202 | data_collator: The data collator for language modeling. 203 | model : The pre-trained and fine-tuned model. 204 | 205 | Returns: 206 | Trainer: The Trainer instance for model training. 207 | """ 208 | training_args = create_training_arguments() 209 | trainer = Trainer( 210 | model=model, 211 | tokenizer=tokenizer, 212 | args=training_args, 213 | data_collator=data_collator, 214 | train_dataset=train_data, 215 | eval_dataset=train_data, 216 | ) 217 | return trainer 218 | 219 | 220 | def main(): 221 | """ 222 | The main function that orchestrates the data preparation, model loading, 223 | and training processes using the provided YAML configuration. 
224 | """ 225 | 226 | parser = argparse.ArgumentParser( 227 | description="Training script for QLoRA adapter tuning" 228 | ) 229 | parser.add_argument( 230 | "--config", type=str, required=True, help="Path to the YAML configuration file" 231 | ) 232 | args = parser.parse_args() 233 | 234 | with open(args.config, "r") as file: 235 | config = yaml.safe_load(file) 236 | 237 | content = prepare_data( 238 | config["Data"]["repo_path"], 239 | config["Data"]["extensions"], 240 | config["Data"]["output_file"], 241 | ) 242 | 243 | train_data, data_collator, tokenizer = data_for_training(content, config) 244 | model = load_base_model(config) 245 | model = load_peft_model(model, config) 246 | trainer = create_trainer(config, tokenizer, train_data, data_collator, model) 247 | 248 | trainer.train() 249 | 250 | 251 | if __name__ == "__main__": 252 | main() 253 | -------------------------------------------------------------------------------- /graph_rag/evaluation/README.MD: -------------------------------------------------------------------------------- 1 | 2 | # Knowledge Graph Evaluation 3 | 4 | This module provides methods to evaluate the performance of GraphRag. The following integrations are available for evaluation: 5 | 6 | - **Llama-Index Evaluation Pack** 7 | - **Ragas Evaluation Pack** 8 | 9 | Additionally, this module includes scripts for creating custom test datasets to benchmark and evaluate GraphRag. 10 | 11 | ## Getting Started 12 | This section demonstrates how to use the functions provided in the module: 13 | 14 | --- 15 | 16 | ### 1. QA Generation and Critique 17 | 18 | This module offers tools to generate question-answer (QA) pairs from input documents using a language model and critique them based on various criteria like groundedness, relevance, and standalone quality. 19 | 20 | > #### Generate and Critique QA Pairs 21 | 22 | To use this module, follow these steps: 23 | 24 | #### 1. Generate QA Pairs 25 | 26 | First, we prepare our dataset for generating QA pairs. In this example, we'll use Keras-IO documentation and Llama-Index's `SimpleDirectoryReader` to obtain `Document` objects. 27 | 28 | ```python 29 | !git clone https://github.com/keras-team/keras-io.git 30 | 31 | def get_data(input_dir="path/to/keras-io/templates"): 32 | reader = SimpleDirectoryReader( 33 | input_dir, 34 | recursive=True, 35 | exclude=["path/to/keras-io/templates/examples"] 36 | ) 37 | docs = reader.load_data() 38 | 39 | splitter = SentenceSplitter( 40 | chunk_size=300, 41 | chunk_overlap=20, 42 | ) 43 | nodes = splitter.get_nodes_from_documents(docs) 44 | documents = [Document(text=node.text, metadata=node.metadata) for node in nodes] 45 | 46 | return docs 47 | 48 | # load the documents 49 | documents=get_data() 50 | ``` 51 | 52 | Use the `qa_generator` function to generate QA pairs from your input documents. 53 | 54 | ```python 55 | from evaluation.ragas_evaluation.QA_graphrag_testdataset import qa_generator 56 | 57 | N_GENERATIONS = 20 58 | 59 | # Generate the QA pairs 60 | qa_pairs = qa_generator(documents, N_GENERATIONS) 61 | ``` 62 | 63 | #### 2. Critique the Generated QA Pairs 64 | 65 | Once you have generated the QA pairs, critique them using the `critique_qa` function. 
66 | 67 | ```python 68 | from evaluation.ragas_evaluation.QA_graphrag_testdataset import critique_qa 69 | 70 | # Critique the generated QA pairs 71 | critiqued_qa_pairs = critique_qa(qa_pairs) 72 | 73 | # The critiqued pairs will include scores and evaluations for groundedness, relevance, and standalone quality 74 | ``` 75 | 76 | --- 77 | ### 2. Evaluating Your Knowledge Graph with Llama-Index Evaluator Pack 78 | 79 | This section demonstrates how to evaluate the performance of your query engine using the Llama-Index RAG evaluator pack. 80 | 81 | > #### Evaluate Your Knowledge Graph with llama-index 82 | 83 | To evaluate your query engine, follow these steps: 84 | ```shell 85 | llamaindex-cli download-llamadataset PaulGrahamEssayDataset --download-dir ./data 86 | ``` 87 | 88 | ```python 89 | from evaluation.evaluation_llama_index import evaluate 90 | 91 | 92 | # Path to your labeled RAG dataset 93 | RAG_DATASET = "./data/rag_dataset.json" 94 | 95 | # Define the language model and embedding 96 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 97 | from llama_index.llms.ollama import Ollama 98 | 99 | llm = Ollama(base_url="http://localhost:11434", model="llama2") 100 | embedding = HuggingFaceEmbedding(model_name="microsoft/codebert-base") 101 | 102 | # Your query engine instance 103 | from graph_rag.graph_retrieval.graph_retrieval import get_index_from_pickle, get_query_engine 104 | 105 | index = get_index_from_pickle("path/to/graphIndex.pkl") 106 | query_engine = get_query_engine(index) 107 | 108 | # Evaluate the dataset 109 | evaluation_results = evaluate(RAG_DATASET, query_engine) 110 | 111 | # Review the results 112 | print(evaluation_results) 113 | ``` 114 | | Metrics | RAG | Base RAG | 115 | |------------------------------|------------|-----------| 116 | | **Mean Correctness Score** | 3.340909 | 0.934 | 117 | | **Mean Relevancy Score** | 0.750000 | 4.239 | 118 | | **Mean Faithfulness Score** | 0.386364 | 0.977 | 119 | | **Mean Context Similarity Score** | 0.948765 | 0.977 | 120 | 121 | 122 | 123 | This example shows how to quickly evaluate your query engine's performance using the Llama-Index RAG evaluator pack. 124 | 125 | 126 | --- 127 | ### 3. Evaluating Your Knowledge Graph with Ragas backend 128 | 129 | You can easily evaluate the performance of your query engine using this module. 130 | 131 | > #### Load and Evaluate Your Dataset with ragas 132 | 133 | Use the `load_test_dataset` function to load your dataset and directly evaluate it using the `evaluate` function. This method handles all necessary steps, including batching the data. 134 | 135 | ```python 136 | from evaluation.ragas_evaluation.evaluation_ragas import load_test_dataset, evaluate 137 | 138 | # Step 1: Load the dataset from a pickle file 139 | dataset_path = "/content/keras_docs_embedded.pkl" 140 | test_dataset = load_test_dataset(dataset_path) 141 | ``` 142 | 143 | > **Note:** `test_dataset` is a list of Llama-Index `Document` objects.
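Before launching a full evaluation run, it can help to spot-check what was loaded. The snippet below is an optional, minimal sanity check; it only assumes the standard Llama-Index `Document` attributes (`text` and `metadata`) that are used elsewhere in this repository.

```python
# Optional sanity check on the loaded dataset before evaluation.
print(f"Loaded {len(test_dataset)} documents")

sample = test_dataset[0]
print(sample.text[:200])   # beginning of the chunk text
print(sample.metadata)     # source metadata carried over from ingestion
```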
144 | 145 | ```python 146 | # Step 2: Define the language model and embedding 147 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 148 | from llama_index.llms.ollama import Ollama 149 | 150 | llm = Ollama(base_url="http://localhost:11434", model="codellama") 151 | embedding = HuggingFaceEmbedding(model_name="microsoft/codebert-base") 152 | 153 | # Step 3: Specify the metrics for evaluation 154 | metrics = [faithfulness, answer_relevancy, context_precision, context_recall] 155 | 156 | # Step 4: Load the query engine (Llama-Index) 157 | from graph_rag.graph_retrieval.graph_retrieval import get_index_from_pickle, get_query_engine 158 | 159 | index = get_index_from_pickle("path/to/graphIndex.pkl") 160 | query_engine = get_query_engine(index) 161 | 162 | # Step 5: Evaluate the dataset 163 | evaluation_results = evaluate( 164 | query_engine=query_engine, 165 | dataset=test_dataset, 166 | llm=llm, 167 | embeddings=embedding, 168 | metrics=metrics, 169 | # Default batch size is 4 170 | ) 171 | ``` 172 | 173 | **Output:** 174 | ```python 175 | {'faithfulness': 0.0333, 'answer_relevancy': 0.9834, 'context_precision': 0.2000, 'context_recall': 0.8048} 176 | ``` 177 | 178 | ```python 179 | rdf = evaluation_results.to_pandas() 180 | rdf.to_csv("results.csv", index=False) 181 | ``` 182 | --- 183 | **Detailed Result:** 184 | 185 | | question | contexts | answer | ground_truth | faithfulness | answer_relevancy | context_precision | context_recall | 186 | |-----------------------------------------------|---------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------|--------------|------------------|-------------------|----------------| 187 | | What is mixed precision in computing? | [Examples GPT-2 text generation Parameter…] | Mixed precision is a technique used to improve… | A combination of different numerical precision… | 0.166667 | 0.981859 | 0.0 | 0.666667 | 188 | | What is the title of the guide discussed in th... | [Available guides… Hyperparameter T…] | The title of the guide discussed in the given… | How to distribute training | 0.000000 | 1.000000 | 0.0 | 1.000000 | 189 | | What is Keras 3? | [No relationships found.] | Keras 3 is a new version of the popular deep l… | A deep learning framework that works with Tensor… | 0.000000 | 0.974711 | 0.0 | 0.500000 | 190 | | What was the percentage boost in StableDiffusion... | [A first example: A MNIST convnet…] | The percentage boost in StableDiffusion traini… | Over 150% | 0.000000 | 0.970565 | 1.0 | 1.000000 | 191 | | What are some examples of pretrained models av... | [No relationships found.] 
| Some examples of pre-trained models available… | BERT, OPT, Whisper, T5, StableDiffusion, YOLOv8… | 0.000000 | 0.989769 | 0.0 | 0.857143 | 192 | 193 | 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /experiments/experiment_t5_abs_summarization/experiment_t5_abs_summarization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4" 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "execution_count": 11, 22 | "metadata": { 23 | "colab": { 24 | "base_uri": "https://localhost:8080/" 25 | }, 26 | "id": "a1J6mrksgO0w", 27 | "outputId": "fc730428-d183-4957-c81a-3c9876a248d3" 28 | }, 29 | "outputs": [ 30 | { 31 | "output_type": "stream", 32 | "name": "stdout", 33 | "text": [ 34 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 35 | "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n", 36 | "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.29.2)\n", 37 | "Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (1.26.15)\n", 38 | "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.0)\n", 39 | "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n", 40 | "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n", 41 | "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n", 42 | "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", 43 | "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n", 44 | "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n", 45 | "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.5)\n", 46 | "Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.1)\n", 47 | "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", 48 | "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", 49 | "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", 50 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", 51 | "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", 52 | "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", 53 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", 54 | "Requirement already satisfied: fsspec in 
/usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.4.0)\n", 55 | "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2)\n", 56 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n", 57 | "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", 58 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", 59 | "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "!pip install torch transformers urllib3" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "source": [ 70 | "import torch\n", 71 | "from transformers import AutoTokenizer, AutoModelWithLMHead\n", 72 | "import urllib3" 73 | ], 74 | "metadata": { 75 | "id": "DTeMo878grDl" 76 | }, 77 | "execution_count": 12, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "source": [ 83 | "tokenizer=AutoTokenizer.from_pretrained('T5-base')\n", 84 | "model=AutoModelWithLMHead.from_pretrained('T5-base', return_dict=True)" 85 | ], 86 | "metadata": { 87 | "colab": { 88 | "base_uri": "https://localhost:8080/" 89 | }, 90 | "id": "sgRtK1S5gwy5", 91 | "outputId": "bc79d4b9-85c8-43f3-ce81-db56f0fda82e" 92 | }, 93 | "execution_count": 13, 94 | "outputs": [ 95 | { 96 | "output_type": "stream", 97 | "name": "stderr", 98 | "text": [ 99 | "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/modeling_auto.py:1352: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. 
Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", 100 | " warnings.warn(\n" 101 | ] 102 | } 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "source": [ 108 | "git_repo = input('git_repo url : ')\n", 109 | "git_repo = git_repo.replace(\"github.com\", \"raw.githubusercontent.com\")\n", 110 | "git_repo = git_repo + \"/master/README.md\"\n", 111 | "\n", 112 | "http = urllib3.PoolManager()\n", 113 | "r = http.request('GET', git_repo, preload_content=False)\n", 114 | "prompt = str(r.read()) + \"\\nsummarize: \"\n", 115 | "prompt = str(prompt)\n", 116 | "r.release_conn()\n" 117 | ], 118 | "metadata": { 119 | "colab": { 120 | "base_uri": "https://localhost:8080/" 121 | }, 122 | "id": "u5uVhkdtg1_H", 123 | "outputId": "9e2bb214-8e05-4d12-8bff-41616ccd4e61" 124 | }, 125 | "execution_count": 54, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "git_repo url : https://github.com/kikinteractive/app\n" 132 | ] 133 | } 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "source": [ 139 | "inputs=tokenizer.encode(prompt, return_tensors='pt', max_length=1024, truncation=True)" 140 | ], 141 | "metadata": { 142 | "id": "-0vT8Ychi2_9" 143 | }, 144 | "execution_count": 55, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "source": [ 150 | "output = model.generate(inputs, min_length=256, max_length=512)\n" 151 | ], 152 | "metadata": { 153 | "id": "hTylPPZDhSMR" 154 | }, 155 | "execution_count": 56, 156 | "outputs": [] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "source": [ 161 | "summary=tokenizer.decode(output[0], skip_special_tokens=True)\n", 162 | "\n", 163 | "print(\"README : \")\n", 164 | "print(str(prompt))\n", 165 | "\n", 166 | "print(\"Summary : \")\n", 167 | "print(summary)" 168 | ], 169 | "metadata": { 170 | "colab": { 171 | "base_uri": "https://localhost:8080/" 172 | }, 173 | "id": "D83RWVvihU2e", 174 | "outputId": "b7073088-8cde-4833-b3ee-21286c2b1781" 175 | }, 176 | "execution_count": 57, 177 | "outputs": [ 178 | { 179 | "output_type": "stream", 180 | "name": "stdout", 181 | "text": [ 182 | "README : \n", 183 | "b'app.js - mobile webapps made easy\\n=================================\\n\\nApp.js is a lightweight JavaScript UI library for creating mobile webapps that behave like native apps, sacrificing neither performance nor polish.\\n\\n* cross-platform (Android 2.2+, iOS 4.3+)\\n* themable platform-specific UI designs\\n* configurable native-like transitions\\n* automatically managed navigation stack\\n* built-in widgets for general use-cases\\n\\nThe goal of App.js is to provide a robust starting point for mobile webapps, handling general scenarios, and maintaining compatiblity with other common JavaScript libraries.\\n\\n\\n##[Check out the documentation here](http://code.kik.com/app/)\\n\\n\\n##[Changelog](/CHANGELOG.md)\\n'\n", 184 | "summarize: \n", 185 | "Summary : \n", 186 | "b'app.js is a lightweight JavaScript UI library for creating mobile webapps that behave like native apps. it's designed to provide a robust starting point for mobile webapps, handling general scenarios, and maintaining compatiblity with other common JavaScript libraries. it's also a great way to create custom mobile webapps, without having to sacrifice performance or polish. 
b'app.js is available for ios and android 2.2+, and is .........................\n" 187 | ] 188 | } 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "source": [], 194 | "metadata": { 195 | "id": "ed7TqUT4hqF-" 196 | }, 197 | "execution_count": 40, 198 | "outputs": [] 199 | } 200 | ] 201 | } -------------------------------------------------------------------------------- /graph_rag/experiments/EXPERIMENTS.MD: -------------------------------------------------------------------------------- 1 | # Experiments 2 | 3 | The major portion of my time in the first phase of the GSoC project has been spent experimenting with different models, embeddings, and libraries. 4 | 5 | ## Knowledge Graph from Documentation 6 | 7 | The majority of the documentation for libraries is stored in the form of HTML and markdown files in their GitHub repositories. 8 | 9 | We first used llama-index document loaders to load all documents with the .md extension. We then performed chunking and created a Document instance of them. 10 | 11 | ## Knowledge Graph Using Code Embeddings 12 | 13 | Implementation of the idea can be found here: [Colab](https://colab.research.google.com/drive/1uguR76SeMAukN4uAhKuXU_ja8Ik0s8Wj#scrollTo=CUgtX5D1Tl_x). 14 | 15 | The idea is to separate code blocks or take code and split it using a code splitter, then pass it to a model for building a Knowledge Graph using code embeddings. I used: 16 | - Salesforce/codegen2-7B_P quantized (4-bit) 17 | - Salesforce/codet5p-110m-embedding 18 | - Python files in Keras-io 19 | 20 | ### Model Selection 21 | 22 | We need a model that is open source and can work on the free Colab version to begin with. For a better knowledge graph, we quantized models above 20GB to 4 bits using bitsandbytes configuration. We tried the following LLMs: 23 | - gemini pro 24 | - [QuantiPhy/zephyr-7b-beta(4bit-quantized)**](https://huggingface.co/QuantiPhy/zephyr-7b-beta-4bit-quantized) 25 | - llama3 (Ollama version) 26 | - codellama (Ollama version) 27 | - [QuantiPhy/aya-23-8B (4bit quantized)**](https://huggingface.co/QuantiPhy/aya-23-8B-4bq) 28 | - gpt-neo-2.7B(4bit-quantized) 29 | - [Salesforce/codegen2-7B_P(4bit-quantized)**](https://huggingface.co/QuantiPhy/Salesforce_codegen2-7B_P) 30 | - phi3 (Ollama) 31 | - phi3:medium (Ollama) 32 | - neural-chat (Ollama) 33 | - gemma2 (Ollama) 34 | - mistral (Ollama) 35 | ** all these models,I have 4bit-quantized them using bitsandbytes 36 | ### Embeddings 37 | 38 | For embeddings, we tried: 39 | - microsoft/codebert-base 40 | - Salesforce/codet5p-110m-embedding 41 | 42 | ### Libraries 43 | 44 | In the initial phase, we are looking for libraries in the community that solve the problem of building Knowledge Graphs: 45 | - [llama-index knowledge-graph builder](https://github.com/run-llama/llama_index/tree/main/llama-index-core/llama_index/core/indices/knowledge_graph) 46 | - [llm-graph-builder](https://github.com/neo4j-labs/llm-graph-builder) 47 | - [graph_builder](https://github.com/sarthakrastogi/graph-rag) 48 | 49 | ### Table 50 | 51 | | Model | Embeddings | Libraries | Remarks | Documents | Artifacts | 52 | |:----------------------------|:---------------------|:---------------------------|:------------|:-------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 53 | | gemma2 (Ollama) | 
microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/gemma2/Graph_visualization_gemma2_mscb.html)
[index](artifacts/gemma2/gemma2graphIndex.pkl)
[collab](https://colab.research.google.com/drive/1q7FED2Lapk3D7ibqkO3NkqZ6iPNZ_x6H?usp=sharing) | 54 | | mistral (Ollama) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/mistral/Graph_visualization_mistral_mscb.html)
[index](artifacts/mistral/mistralgraphIndex.pkl)
[collab](https://colab.research.google.com/drive/1q7FED2Lapk3D7ibqkO3NkqZ6iPNZ_x6H?usp=sharing) | 55 | | neural-chat (Ollama) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/neural_chat/Graph_visualization_neuralchat_mscb.html)
[index](artifacts/neural_chat/graphIndex_neuralchat_mscb.pkl)
[collab](https://colab.research.google.com/drive/1cM6ujhiKM1v0bRYVN9F9UEgjYlwkBTt9?usp=sharing) | 56 | | phi3:medium (Ollama) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/phi3-med/Graph_visualization_phi3-med_mscb.html)
[index](artifacts/phi3-med/graphIndex_phi3_medium_mscb.pkl)
[collab](https://colab.research.google.com/drive/1cM6ujhiKM1v0bRYVN9F9UEgjYlwkBTt9?usp=sharing) | 57 | | phi3 (Ollama) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/phi3/Graph_visualization_phi3_mscb.html)
[index](artifacts/phi3/graphIndex_phi3_mscb.pkl)
[collab](https://colab.research.google.com/drive/1cM6ujhiKM1v0bRYVN9F9UEgjYlwkBTt9?usp=sharing) | 58 | | gpt-4o | open-ai | Neo4jGraphBuilder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/vizualization/visualisation.png) | 59 | | Gemini | gemini | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/ex1.html) | 60 | | Gemini | gemini | llama-index graph builder | Rate-limit error | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | | 61 | | Gemini | microsoft/codebert-base | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/gem_mcode_k_nlp.html) | 62 | | Zephyr (4-bit) | microsoft/codebert-base | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/zy_knlp.html) | 63 | | Zephyr (4-bit) | microsoft/codebert-base | llama-index graph builder | nil | [keras-io](https://github.com/keras-team/keras-io/tree/master/templates) | [viz](artifacts/vizualization/examp.html) | 64 | | llama3 (Ollama version) | microsoft/codebert-base | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/Graph_visualization.html) | 65 | | codellama (Ollama version) | microsoft/codebert-base | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/code_1.html) | 66 | | gpt-neo-2.7B-4bit-quantized | microsoft/codebert-base | llama-index graph builder | nil | [keras-nlp](https://github.com/keras-team/keras-io/blob/master/templates/keras_nlp/index.md) | [viz](artifacts/vizualization/graph_gpt3-neo.html) | 67 | 68 | ### Notes 69 | - ### [graph_builder](https://github.com/sarthakrastogi/graph-rag) 70 | 71 | - I explored graph_rag by Sarthak. It is fundamentally based on function calling (JSON output), and it works very well for powerful models. However, small-sized LLMs tend to make mistakes regardless of how well the prompt is crafted. 72 | - These observations come from trying out and debugging the library myself. I modified the system prompts, which led to fewer mistakes, and added a method to download .html files for visualization. Additionally, I added methods to use Ollama open-source models. 73 | - [rough_codes](https://colab.research.google.com/drive/1q6T8mK-O2XKqY-iGFz6xdrzvqLzu73lm#scrollTo=H0QG6QUVub8T) contains the code, modifications, and implementation notes for the repo -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | <div
2 | project explainer 3 | 4 | [Project Explainer (as module)](#project-explainer-as-module) • 5 | [Project Explainer (as ui)](#project-explainer-as-ui) • 6 | [Project Repository Utilities (gh_processor py module)](#project-repository-utilities-gh_processor-py-module) 7 | </div>
8 | 9 |
10 | 11 | Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository. 12 | 13 | 14 | ## Tools 15 | 16 | ### Project Explainer (as module) 17 | 18 | A python module that is capable of providing different levels of summary for the give github repo using transformer models 19 | 20 | #### Installation 21 | 22 | ``` 23 | pip install git+https://github.com/c2siorg/Project-Explainer.git@main#subdirectory=project_explainer&egg=gh_explainer 24 | ``` 25 | 26 | #### Example usage 27 | 28 | ```python 29 | from project_explainer import Explainer 30 | 31 | gptExplainer = Explainer("gpt2") 32 | 33 | print(gptExplainer.brief("https://github.com/c2siorg/Project-Explainer.git")) 34 | ``` 35 | 36 | #### Output 37 | 38 | ``` 39 | {'prompt': {'prompt': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. 
This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.'}, 'prepared_prompt': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.\nExplain the above : ', 'summary': 'Project-Explainer Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.\nExplain the above : \xa0The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. 
The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model. The goal of this project is to provide a simple, easy to use, and very fast way to summarize a language model.'} 40 | ``` 41 | 42 | ### Project Explainer (as ui) 43 | 44 | Use project explainer as UI 45 | 46 | #### Dependencies 47 | 48 | ``` 49 | pip install -r project_explainer_ui/requirements.txt 50 | ``` 51 | 52 | #### Example usage 53 | 54 | ``` 55 | python project_explainer_ui/ui.py 56 | ``` 57 | 58 | ![](static/ui.png) 59 | 60 | 61 | ### Project Repository Utilities (gh_processor py module) 62 | 63 | A simple python module packed with utilities to process files in a project repository such as git repositories. 64 | 65 | #### Installation 66 | 67 | ``` 68 | pip install git+https://github.com/c2siorg/Project-Explainer.git@main#subdirectory=project_processor&egg=gh_processor 69 | ``` 70 | 71 | #### Example usage 72 | 73 | ```python 74 | from gh_processor import download_github_repo, extract_headings_with_paragraphs_from_markdown, get_files_by_extension 75 | 76 | git_url = "https://github.com/c2siorg/Project-Explainer.git" 77 | 78 | repo_path = download_github_repo(git_url) 79 | 80 | print(repo_path) 81 | 82 | markdown_files = get_files_by_extension(repo_path, [".md"]) 83 | 84 | headings_with_content = {} 85 | 86 | print(markdown_files) 87 | 88 | for markdown_file in markdown_files: 89 | print(markdown_file) 90 | headings_with_content[markdown_file] = extract_headings_with_paragraphs_from_markdown(markdown_file) 91 | 92 | print(headings_with_content) 93 | ``` 94 | 95 | Output 96 | 97 | ``` 98 | {'/Users/sripravallika/Project-Explainer/Project-Explainer/README.md': {'Project-Explainer': 'Large Language Models are picking pace very quickly and they are turning out to be extremely good in multiple tasks. With the help of zero-shot, few-shot, and fine tuning techniques we could effectively specialize a language model for the use case. Summarization is one such use case that has been widely researched for a couple of years now. Broadly there are techniques such as Abstractive and Extractive approaches. The motive of this project proposal is to handle the summarization task (mostly Abstractive + Extractive hybrid approach) through the language model’s (foundation model) lens. This project aims to cover everything from data collection, EDA, experimenting with different language models to developing production-scale system that can take GitHub repo as reference and provide summary. One of the challenges that is novel is to use smaller sized models to achieve great performance in summarization. SCoRe Lab has been into developing solutions in the space of making user life easier with products such as D4D, Bassa, Track Pal, and others. This project will add to that portfolio and would be a great reference for AI practitioners and system developers which aims to work right from data to production-grade end product using AI and Systems. 
This repository will hold, data/data references, experiments, and a system that takes GitHub Link as input and provides a summary for the repository.'}} 99 | ``` 100 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /experiments/experiment_bart_ft_abs_summarization/experiment_bart_ft_abs_summarization_eval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4" 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "source": [ 22 | "# import locale\n", 23 | "# locale.getpreferredencoding = lambda: \"UTF-8\"" 24 | ], 25 | "metadata": { 26 | "id": "60IU1eNJyJ4G" 27 | }, 28 | "execution_count": 1, 29 | "outputs": [] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "source": [ 34 | "!pip install transformers datasets evaluate rouge_score sentencepiece" 35 | ], 36 | "metadata": { 37 | "colab": { 38 | "base_uri": "https://localhost:8080/" 39 | }, 40 | "id": "HejdwoIRyD50", 41 | "outputId": "f8788dd3-0c37-40cf-c421-63dfc4b33fd7" 42 | }, 43 | "execution_count": 2, 44 | "outputs": [ 45 | { 46 | "output_type": "stream", 47 | "name": "stdout", 48 | "text": [ 49 | "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.32.0)\n", 50 | "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.14.4)\n", 51 | "Requirement already satisfied: evaluate in /usr/local/lib/python3.10/dist-packages (0.4.0)\n", 52 | "Requirement already satisfied: rouge_score in /usr/local/lib/python3.10/dist-packages (0.1.2)\n", 53 | "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.99)\n", 54 | "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", 55 | "Requirement already satisfied: huggingface-hub<1.0,>=0.15.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.16.4)\n", 56 | "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", 57 | "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", 58 | "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", 59 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", 60 | "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", 61 | "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", 62 | "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.3)\n", 63 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", 64 | "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n", 65 | "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.7)\n", 66 | "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", 67 | "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) 
(3.3.0)\n", 68 | "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.15)\n", 69 | "Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", 70 | "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.5)\n", 71 | "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.10/dist-packages (from evaluate) (0.18.0)\n", 72 | "Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge_score) (1.4.0)\n", 73 | "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge_score) (3.8.1)\n", 74 | "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge_score) (1.16.0)\n", 75 | "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", 76 | "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (3.2.0)\n", 77 | "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n", 78 | "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", 79 | "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n", 80 | "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.0)\n", 81 | "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", 82 | "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.15.1->transformers) (4.7.1)\n", 83 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", 84 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.4)\n", 85 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n", 86 | "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge_score) (8.1.7)\n", 87 | "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->rouge_score) (1.3.2)\n", 88 | "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", 89 | "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.3)\n" 90 | ] 91 | } 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "source": [ 97 | "from transformers import pipeline\n" 98 | ], 99 | "metadata": { 100 | "id": "DTeMo878grDl" 101 | }, 102 | "execution_count": 3, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "source": [ 108 | "from datasets import load_dataset\n", 109 | "\n", 110 | "billsum = load_dataset(\"billsum\", split=\"ca_test\")\n", 111 | "# xsum = load_dataset(\"xsum\", split=\"test\")" 112 | ], 113 | "metadata": { 114 | "id": "ed7TqUT4hqF-" 115 | }, 116 | "execution_count": 4, 117 | "outputs": [] 118 | }, 119 | { 120 | 
"cell_type": "code", 121 | "source": [ 122 | "# xsum" 123 | ], 124 | "metadata": { 125 | "id": "bWy_szkjrMhB" 126 | }, 127 | "execution_count": 5, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "source": [ 133 | "billsum" 134 | ], 135 | "metadata": { 136 | "colab": { 137 | "base_uri": "https://localhost:8080/" 138 | }, 139 | "id": "9Iv45JnCsGqt", 140 | "outputId": "2cccf711-cd86-4c53-ad91-ee3c105b0206" 141 | }, 142 | "execution_count": 6, 143 | "outputs": [ 144 | { 145 | "output_type": "execute_result", 146 | "data": { 147 | "text/plain": [ 148 | "Dataset({\n", 149 | " features: ['text', 'summary', 'title'],\n", 150 | " num_rows: 1237\n", 151 | "})" 152 | ] 153 | }, 154 | "metadata": {}, 155 | "execution_count": 6 156 | } 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "source": [ 162 | "from tqdm import tqdm" 163 | ], 164 | "metadata": { 165 | "id": "i-eSVEM9t9R2" 166 | }, 167 | "execution_count": 7, 168 | "outputs": [] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "source": [ 173 | "# xsum_pred = []\n", 174 | "# for doc in tqdm(xsum[\"document\"], total=len(xsum[\"document\"])):\n", 175 | "# prompt = doc + \"\\nsummarize:\"\n", 176 | "# inputs = tokenizer.encode(prompt, return_tensors='pt', max_length=1024, truncation=True).to(\"cuda\")\n", 177 | "# output = model.generate(inputs)\n", 178 | "# xsum_pred.append(tokenizer.decode(output[0], skip_special_tokens=True))\n" 179 | ], 180 | "metadata": { 181 | "id": "EoFqVBEksKW6" 182 | }, 183 | "execution_count": 8, 184 | "outputs": [] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "source": [ 189 | "summarizer = pipeline(\"summarization\", model=\"knkarthick/MEETING_SUMMARY\", device=\"cuda:0\")\n" 190 | ], 191 | "metadata": { 192 | "id": "_tHpEqZV1EQY" 193 | }, 194 | "execution_count": null, 195 | "outputs": [] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "source": [ 200 | "tokenizer_kwargs = {'truncation':True,'max_length':512,'return_tensors':'pt'}" 201 | ], 202 | "metadata": { 203 | "id": "-FQLtsSr2-jX" 204 | }, 205 | "execution_count": null, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "source": [ 211 | "billsum_pred = []\n", 212 | "for doc in tqdm(billsum[\"text\"], total=len(billsum[\"text\"])):\n", 213 | "\n", 214 | " output = summarizer(doc, **tokenizer_kwargs)\n", 215 | " billsum_pred.append(output[0][\"summary_text\"])" 216 | ], 217 | "metadata": { 218 | "id": "b9AKooOasS_t" 219 | }, 220 | "execution_count": null, 221 | "outputs": [] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "source": [ 226 | "import evaluate" 227 | ], 228 | "metadata": { 229 | "id": "sdn5p-2wwd8w" 230 | }, 231 | "execution_count": null, 232 | "outputs": [] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "source": [ 237 | "bleu = evaluate.load(\"bleu\")" 238 | ], 239 | "metadata": { 240 | "id": "NHkg44iKuWhf" 241 | }, 242 | "execution_count": null, 243 | "outputs": [] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "source": [ 248 | "print(billsum[\"summary\"])" 249 | ], 250 | "metadata": { 251 | "id": "59KjrRhTwtqY" 252 | }, 253 | "execution_count": null, 254 | "outputs": [] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "source": [ 259 | "results = bleu.compute(predictions=billsum_pred, references=billsum[\"summary\"])" 260 | ], 261 | "metadata": { 262 | "id": "g4LYYPthwcgs" 263 | }, 264 | "execution_count": null, 265 | "outputs": [] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "source": [ 270 | "results" 271 | ], 272 | "metadata": { 273 | "id": "pO8xelg4wqRK" 274 | }, 275 | 
"execution_count": null, 276 | "outputs": [] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "source": [ 281 | "rouge = evaluate.load(\"rouge\")" 282 | ], 283 | "metadata": { 284 | "id": "73RNJhvIw3aJ" 285 | }, 286 | "execution_count": null, 287 | "outputs": [] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "source": [ 292 | "rouge_results = rouge.compute(predictions=billsum_pred, references=billsum[\"summary\"])" 293 | ], 294 | "metadata": { 295 | "id": "HlOybJAFxDI2" 296 | }, 297 | "execution_count": null, 298 | "outputs": [] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "source": [ 303 | "rouge_results" 304 | ], 305 | "metadata": { 306 | "id": "xNgumPMmxK2u" 307 | }, 308 | "execution_count": null, 309 | "outputs": [] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "source": [], 314 | "metadata": { 315 | "id": "Rx6smpcCxMgk" 316 | }, 317 | "execution_count": null, 318 | "outputs": [] 319 | } 320 | ] 321 | } -------------------------------------------------------------------------------- /project_processor/gh_processor/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Dict 3 | import re 4 | import markdown2 5 | import spacy 6 | 7 | 8 | def get_files_by_extension(directory: str, extensions: List[str]) -> List[str]: 9 | """ 10 | Retrieve the paths of files in the given directory that match the specified file extensions. 11 | 12 | Args: 13 | directory (str): The directory path to search for files. 14 | extensions (list): A list of file extensions to match. 15 | 16 | Returns: 17 | list: A list of file paths that match the given extensions. 18 | """ 19 | file_paths = [] 20 | 21 | for root, dirs, files in os.walk(directory): 22 | for file in files: 23 | file_extension = os.path.splitext(file)[1] 24 | if file_extension in extensions: 25 | file_path = os.path.join(root, file) 26 | file_paths.append(file_path) 27 | 28 | return file_paths 29 | 30 | 31 | def extract_code_blocks_from_markdown(file_path: str) -> List[str]: 32 | """ 33 | Extract code blocks from a Markdown file. 34 | 35 | Args: 36 | file_path (str): The path of the Markdown file. 37 | 38 | Returns: 39 | List[str]: A list of code blocks extracted from the Markdown file. 40 | """ 41 | if not file_path.endswith(".md"): 42 | raise ValueError("The provided file is not a Markdown file.") 43 | 44 | code_blocks = [] 45 | 46 | with open(file_path, "r") as file: 47 | lines = file.readlines() 48 | code_block = [] 49 | in_code_block = False 50 | 51 | for line in lines: 52 | if line.startswith("```"): 53 | if in_code_block: 54 | code_blocks.append("".join(code_block)) 55 | code_block = [] 56 | in_code_block = False 57 | else: 58 | in_code_block = True 59 | elif in_code_block: 60 | code_block.append(line) 61 | 62 | return code_blocks 63 | 64 | 65 | def extract_links_from_markdown(file_path: str) -> List[str]: 66 | """ 67 | Extract links from a Markdown file. 68 | 69 | Args: 70 | file_path (str): The path of the Markdown file. 71 | 72 | Returns: 73 | List[str]: A list of links extracted from the Markdown file. 
74 | """ 75 | if not file_path.endswith(".md"): 76 | raise ValueError("The provided file is not a Markdown file.") 77 | 78 | links = [] 79 | 80 | with open(file_path, "r") as file: 81 | content = file.read() 82 | link_pattern = r"\[(.*?)\]\((.*?)\)" 83 | matches = re.findall(link_pattern, content) 84 | 85 | for match in matches: 86 | link_text, link_url = match 87 | links.append(link_url) 88 | 89 | return links 90 | 91 | 92 | def extract_images_from_markdown(file_path: str) -> List[str]: 93 | """ 94 | Extract image URLs from a Markdown file. 95 | 96 | Args: 97 | file_path (str): The path of the Markdown file. 98 | 99 | Returns: 100 | List[str]: A list of image URLs extracted from the Markdown file. 101 | """ 102 | if not file_path.endswith(".md"): 103 | raise ValueError("The provided file is not a Markdown file.") 104 | 105 | images = [] 106 | 107 | with open(file_path, "r") as file: 108 | content = file.read() 109 | image_pattern = r"!\[(.*?)\]\((.*?)\)" 110 | matches = re.findall(image_pattern, content) 111 | 112 | for match in matches: 113 | alt_text, image_url = match 114 | images.append(image_url) 115 | 116 | return images 117 | 118 | 119 | def extract_headings_with_paragraphs_from_markdown(file_path: str) -> dict: 120 | """ 121 | Extract headings and the paragraph text below each heading from a Markdown file. 122 | 123 | Args: 124 | file_path (str): The path of the Markdown file. 125 | 126 | Returns: 127 | dict: A dictionary where the keys are the headings and the values are the corresponding paragraphs. 128 | """ 129 | if not file_path.endswith(".md"): 130 | raise ValueError("The provided file is not a Markdown file.") 131 | 132 | heading_paragraphs = {} 133 | 134 | with open(file_path, "r") as file: 135 | content = file.read() 136 | heading_pattern = r"#+\s(.+)" 137 | matches = re.findall(heading_pattern, content) 138 | 139 | for match in matches: 140 | heading = match 141 | next_line_index = content.index(match) + len(match) + 1 142 | next_line = content[next_line_index:].strip() 143 | 144 | if next_line.startswith("#"): 145 | paragraph = "" 146 | else: 147 | paragraph = next_line 148 | 149 | heading_paragraphs[heading] = paragraph 150 | 151 | return heading_paragraphs 152 | 153 | 154 | def extract_tables_from_markdown(file_path: str) -> List[List[str]]: 155 | """ 156 | Extract tables from a Markdown file. 157 | 158 | Args: 159 | file_path (str): The path of the Markdown file. 160 | 161 | Returns: 162 | List[List[str]]: A list of tables extracted from the Markdown file. 163 | """ 164 | if not file_path.endswith(".md"): 165 | raise ValueError("The provided file is not a Markdown file.") 166 | 167 | tables = [] 168 | 169 | with open(file_path, "r") as file: 170 | content = file.read() 171 | table_pattern = r"\|(.+)\|(\n\|.+)+\n?" 172 | matches = re.findall(table_pattern, content) 173 | 174 | for match in matches: 175 | table_lines = match[0].split("\n|")[1:] 176 | table = [line.strip().split("|") for line in table_lines] 177 | tables.append(table) 178 | 179 | return tables 180 | 181 | 182 | def extract_project_description_from_readme(file_path: str) -> str: 183 | """ 184 | Extract the project description from a README.md file. 185 | 186 | Args: 187 | file_path (str): The path of the README.md file. 188 | 189 | Returns: 190 | str: The project description extracted from the README.md file. 
191 | """ 192 | if not file_path.endswith(".md"): 193 | raise ValueError("The provided file is not a .md file.") 194 | 195 | with open(file_path, "r") as file: 196 | lines = file.readlines() 197 | description = "" 198 | in_description = False 199 | 200 | for line in lines: 201 | line = line.strip() 202 | 203 | if not line: 204 | continue 205 | 206 | if not in_description: 207 | if line.lower().startswith("#"): 208 | in_description = True 209 | description += line.lstrip("#").strip() + " " 210 | else: 211 | if line.lower().startswith("#"): 212 | break 213 | else: 214 | description += line + " " 215 | 216 | return description.strip() 217 | 218 | 219 | def convert_markdown_to_html(markdown_text: str) -> str: 220 | """ 221 | Convert Markdown text to html. 222 | 223 | Args: 224 | markdown_text (str): The Markdown text to be converted. 225 | 226 | Returns: 227 | str: The html equivalent text of the Markdown text. 228 | """ 229 | plain_text = markdown2.markdown( 230 | markdown_text, extras=["tables", "fenced-code-blocks"]) 231 | return plain_text 232 | 233 | 234 | def convert_markdown_file_to_html(file_path: str) -> str: 235 | """ 236 | Convert Markdown file to html. 237 | 238 | Args: 239 | file_path (str): The path to the Markdown file. 240 | 241 | Returns: 242 | str: The html equivalent content of the Markdown file. 243 | 244 | Raises: 245 | ValueError: If the file is not a Markdown file. 246 | """ 247 | if not file_path.lower().endswith('.md'): 248 | raise ValueError("The file is not a Markdown file.") 249 | 250 | with open(file_path, 'r') as file: 251 | markdown_text = file.read() 252 | 253 | html_content = convert_markdown_to_html(markdown_text) 254 | 255 | return html_content 256 | 257 | 258 | def check_phrase_similarity_using_spacyweb(phrase1: str, phrase2: str, threshold: float = 0.5) -> bool: 259 | """ 260 | Checks the similarity between two phrases using spaCy's pre-trained word vectors. 261 | 262 | Args: 263 | phrase1 (str): The first phrase. 264 | phrase2 (str): The second phrase. 265 | threshold (float): The threshold similarity score. 266 | 267 | Returns: 268 | bool: True if the similarity score is above the threshold, False otherwise. 269 | """ 270 | # python -m spacy download en_core_web_lg 271 | nlp = spacy.load("en_core_web_lg") 272 | 273 | doc1 = nlp(phrase1) 274 | doc2 = nlp(phrase2) 275 | 276 | similarity_score = doc1.similarity(doc2) 277 | 278 | return similarity_score >= threshold 279 | 280 | 281 | def check_similarity(text1: str, text2: str, strategy: str = "in") -> bool: 282 | """ 283 | Checks the similarity between two texts using different strategies. 284 | 285 | Args: 286 | text1 (str): The first text. 287 | text2 (str): The second text. 288 | strategy (str, optional): The strategy to use for similarity check. 289 | Valid options are: 290 | - "in": Checks if one text is contained within the other. 291 | - "spacy_web": Checks similarity using spaCy's pre-trained word vectors. 292 | 293 | Returns: 294 | bool: True if the texts are similar based on the chosen strategy, False otherwise. 295 | """ 296 | if strategy == "in": 297 | return (text1 in text2) or (text2 in text1) 298 | elif strategy == "spacy_web": 299 | return check_phrase_similarity_using_spacyweb(text1, text2, 0.5) 300 | 301 | 302 | def remove_sections_from_markdown(markdown_content: List[str], headings: List[str], strategy: str = "in") -> List[str]: 303 | """ 304 | Removes sections from Markdown content based on a heading and similarity strategy. 
305 | 306 | Args: 307 | markdown_content (List[str]): The list of lines in the Markdown content. 308 | headings (List[str]): List of headings to search for and remove along with its sections. 309 | strategy (str, optional): The strategy to use for similarity check. Valid options are: 310 | - "in": Checks if the heading is contained within the line. 311 | - "spacy_web": Checks similarity using spaCy's pre-trained word vectors. 312 | 313 | Returns: 314 | List[str]: The updated Markdown content with the specified sections removed. 315 | """ 316 | updated_content = [] 317 | skip_section = False 318 | 319 | for line in markdown_content: 320 | for heading in headings: 321 | if check_similarity(heading, line, strategy): 322 | skip_section = True 323 | break 324 | if not skip_section and line.startswith('#'): 325 | skip_section = False 326 | 327 | if not skip_section: 328 | updated_content.append(line) 329 | 330 | return updated_content 331 | 332 | 333 | def remove_headings_from_markdown_file(file_path: str, heading: str) -> List[str]: 334 | """ 335 | Removes the specified heading and all the subsequent subheadings and paragraphs from the markdown file. 336 | 337 | Args: 338 | file_path (str): The path to the markdown file. 339 | heading (str): The heading to be removed along with its subsequent sections. 340 | 341 | Returns: 342 | List[str]: The updated markdown content with the specified heading and its subsequent sections removed. 343 | 344 | Raises: 345 | ValueError: If the file is not a Markdown file. 346 | """ 347 | if not file_path.lower().endswith('.md'): 348 | raise ValueError("The file is not a Markdown file.") 349 | 350 | with open(file_path, 'r') as file: 351 | markdown_content = file.readlines() 352 | 353 | updated_content = remove_sections_from_markdown(markdown_content, heading) 354 | 355 | return updated_content 356 | 357 | 358 | def get_elements_from_markdown_file(file_path: str, elements: List[str]) -> Dict[str, str]: 359 | """ 360 | Extracts specific elements from a Markdown file. 361 | 362 | Args: 363 | file_path (str): The path to the Markdown file. 364 | elements (List[str]): A list of elements to extract. Valid options are: 365 | - "links": Extracts links from the Markdown file. 366 | - "images": Extracts images from the Markdown file. 367 | - "headings": Extracts headings with their corresponding paragraphs from the Markdown file. 368 | - "code": Extracts code blocks from the Markdown file. 369 | - "tables": Extracts tables from the Markdown file. 370 | - "description": Extracts the project description from a README file. 371 | 372 | Returns: 373 | Dict[str, str]: A dictionary containing the extracted elements as key-value pairs. 374 | The keys correspond to the requested elements, and the values contain the extracted content. 375 | 376 | Raises: 377 | ValueError: If the file is not a Markdown file. 
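Example (an illustrative sketch only; "README.md" is a placeholder path, and element names that are not recognised are silently skipped):
    >>> parts = get_elements_from_markdown_file("README.md", ["links", "images", "todo"])
    >>> sorted(parts.keys())
    ['images', 'links']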
378 |     """
379 |     if not file_path.lower().endswith('.md'):
380 |         raise ValueError("The file is not a Markdown file.")
381 | 
382 |     elements_to_extract = {
383 |         "links": extract_links_from_markdown,
384 |         "images": extract_images_from_markdown,
385 |         "headings": extract_headings_with_paragraphs_from_markdown,
386 |         "code": extract_code_blocks_from_markdown,
387 |         "tables": extract_tables_from_markdown,
388 |         "description": extract_project_description_from_readme
389 |     }
390 | 
391 |     result = {}
392 | 
393 |     for element in elements:
394 |         if element not in elements_to_extract:
395 |             continue
396 |         result[element] = elements_to_extract[element](file_path)
397 | 
398 |     return result
399 | 
400 | 
401 | def remove_images_from_markdown(file_path: str) -> str:
402 |     """
403 |     Removes image tags from a Markdown file and returns the updated content.
404 | 
405 |     Args:
406 |         file_path: The path to the Markdown file.
407 | 
408 |     Returns:
409 |         The Markdown content without images.
410 | 
411 |     Raises:
412 |         ValueError: If the provided file is not a Markdown file or if the file does not exist.
413 |     """
414 | 
415 |     if not file_path.lower().endswith('.md'):
416 |         raise ValueError(
417 |             "Invalid file. Only Markdown files (.md) are supported.")
418 | 
419 |     if not os.path.isfile(file_path):
420 |         raise ValueError("File not found.")
421 | 
422 |     with open(file_path, 'r') as f:
423 |         markdown_content = f.read()
424 | 
425 |     markdown_content_without_images = re.sub(
426 |         r'!\[.*?\]\(.*?\)', '', markdown_content)
427 | 
428 |     return markdown_content_without_images
429 | 
430 | 
431 | def remove_links_from_markdown(file_path: str) -> str:
432 |     """
433 |     Removes link tags from a Markdown file and returns the updated content.
434 | 
435 |     Args:
436 |         file_path: The path to the Markdown file.
437 | 
438 |     Returns:
439 |         The Markdown content without links.
440 | 
441 |     Raises:
442 |         ValueError: If the provided file is not a Markdown file or if the file does not exist.
443 |     """
444 | 
445 |     if not file_path.lower().endswith('.md'):
446 |         raise ValueError(
447 |             "Invalid file. Only Markdown files (.md) are supported.")
448 | 
449 |     if not os.path.isfile(file_path):
450 |         raise ValueError("File not found.")
451 | 
452 |     with open(file_path, 'r') as f:
453 |         markdown_content = f.read()
454 | 
455 |     markdown_content_without_links = re.sub(
456 |         r'\[.*?\]\(.*?\)', '', markdown_content)
457 | 
458 |     return markdown_content_without_links
459 | 
460 | 
461 | def remove_code_blocks_from_markdown(file_path: str) -> str:
462 |     """
463 |     Removes code blocks from a Markdown file and returns the updated content.
464 | 
465 |     Args:
466 |         file_path: The path to the Markdown file.
467 | 
468 |     Returns:
469 |         The Markdown content without code blocks.
470 | 
471 |     Raises:
472 |         ValueError: If the provided file is not a Markdown file or if the file does not exist.
473 |     """
474 | 
475 |     if not file_path.lower().endswith('.md'):
476 |         raise ValueError(
477 |             "Invalid file. Only Markdown files (.md) are supported.")
478 | 
479 |     if not os.path.isfile(file_path):
480 |         raise ValueError("File not found.")
481 | 
482 |     with open(file_path, 'r') as f:
483 |         markdown_content = f.read()
484 | 
485 |     markdown_content_without_code_blocks = re.sub(
486 |         r'```[\s\S]*?```', '', markdown_content)
487 | 
488 |     return markdown_content_without_code_blocks
489 | 
490 | 
491 | def remove_tables_from_markdown(file_path: str) -> str:
492 |     """
493 |     Removes tables from a Markdown file and returns the updated content.
494 | 
495 |     Args:
496 |         file_path: The path to the Markdown file.
497 | 
498 |     Returns:
499 |         The Markdown content without tables.
500 | 
501 |     Raises:
502 |         ValueError: If the provided file is not a Markdown file or if the file does not exist.
503 |     """
504 | 
505 |     if not file_path.lower().endswith('.md'):
506 |         raise ValueError(
507 |             "Invalid file. Only Markdown files (.md) are supported.")
508 | 
509 |     if not os.path.isfile(file_path):
510 |         raise ValueError("File not found.")
511 | 
512 |     with open(file_path, 'r') as f:
513 |         markdown_content = f.read()
514 | 
515 |     # Matches a pipe-delimited header row, its separator row, and all following data rows.
516 |     markdown_content_without_tables = re.sub(
517 |         r'\n\|.*\|\n\|.*\|(\n\|.*\|)+', '', markdown_content)
518 | 
519 |     return markdown_content_without_tables
520 | 
--------------------------------------------------------------------------------
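For orientation, here is a minimal usage sketch of the helpers above. It assumes the module is importable as gh_processor.file_utils (matching the tree above) and that a README.md with an "Installation" heading exists in the working directory; the path and the heading are illustrative, not part of the project.

from gh_processor.file_utils import (
    convert_markdown_file_to_html,
    get_elements_from_markdown_file,
    remove_headings_from_markdown_file,
)

# Pull selected elements out of a README; keys must match the options listed
# in the get_elements_from_markdown_file docstring.
elements = get_elements_from_markdown_file("README.md", ["description", "links"])
print(elements.get("description", ""))

# Drop the "Installation" section (the heading plus its subheadings and paragraphs).
trimmed_lines = remove_headings_from_markdown_file("README.md", "Installation")

# Render the whole README to HTML.
html = convert_markdown_file_to_html("README.md")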