├── README.md ├── llamaindex_vector_search.ts ├── LICENSE ├── LlamaIndex_QA_+_Summary.ipynb └── Hybrid_Search.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # LlamaIndex tutorials 2 | 3 | Overview and tutorials of the LlamaIndex Library 4 | 5 | ### Getting Started 6 | 7 | Videos coming soon https://www.youtube.com/@AnilChandraNaiduMatcha 8 | Subscribe to the channel to get the latest content 9 | 10 | Follow [Anil Chandra Naidu Matcha](https://twitter.com/matchaman11) on Twitter for updates 11 | 12 | Join our Discord server for support https://discord.gg/FBpafqbbYF 13 | 14 | ### Also check 15 | 16 | [LlamaIndex Course](https://github.com/SamurAIGPT/LlamaIndex-course) 17 | -------------------------------------------------------------------------------- /llamaindex_vector_search.ts: -------------------------------------------------------------------------------- 1 | import fs from "fs/promises"; 2 | import { Document, VectorStoreIndex } from "llamaindex"; 3 | 4 | // Build an in-memory vector index over the bundled sample essay and print the answer to a query. 5 | async function main() { 6 | const essay = await fs.readFile( 7 | "node_modules/llamaindex/examples/abramov.txt", 8 | "utf-8" 9 | ); 10 | const document = new Document({ text: essay }); 11 | const index = await VectorStoreIndex.fromDocuments([document]); 12 | const queryEngine = index.asQueryEngine(); 13 | const response = await queryEngine.query( 14 | "What did the author do in college?" 
15 | ); 16 | console.log(response.toString()); 17 | } 18 | 19 | // Report failures (missing sample file, API errors) instead of an unhandled promise rejection. 20 | main().catch((err) => { console.error(err); process.exit(1); }); 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Anil Chandra Naidu Matcha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /LlamaIndex_QA_+_Summary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMUcGEpWYrW1phg4PRIVNjW", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "!pip install llama-index" 33 | ], 34 | "metadata": { 35 | "id": "2vPspYq2pXcd" 36 | }, 37 | "execution_count": null, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": { 44 | "id": "jkeqbN1MorPp" 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "import nest_asyncio\n", 49 | "nest_asyncio.apply()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "source": [ 55 | "from llama_index import (\n", 56 | " VectorStoreIndex,\n", 57 | " ListIndex,\n", 58 | " SimpleDirectoryReader,\n", 59 | " ServiceContext,\n", 60 | " StorageContext)" 61 | ], 62 | "metadata": { 63 | "id": "krd8IAm0pL6G" 64 | }, 65 | "execution_count": 4, 66 | "outputs": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "source": [ 71 | "!wget https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt\n", 72 | "!mkdir data\n", 73 | "!mv state_of_the_union.txt data/" 74 | ], 75 | "metadata": { 76 | "id": "tW-R_LqApiKE" 77 | }, 78 | "execution_count": null, 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "source": [ 84 | "documents = SimpleDirectoryReader('./data').load_data()" 85 | ], 86 | "metadata": { 87 | "id": "fX4ER_FSp99s" 88 | }, 89 | 
"execution_count": 12, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "source": [ 95 | "import os\n", 96 | "os.environ[\"OPENAI_API_KEY\"] = \"openai-key\"\n", 97 | "service_context = ServiceContext.from_defaults(chunk_size=1024)\n", 98 | "nodes = service_context.node_parser.get_nodes_from_documents(documents)" 99 | ], 100 | "metadata": { 101 | "id": "BnyPxCn6qqy9" 102 | }, 103 | "execution_count": 13, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "source": [ 109 | "storage_context = StorageContext.from_defaults()\n", 110 | "storage_context.docstore.add_documents(nodes)" 111 | ], 112 | "metadata": { 113 | "id": "IbOM375AqvSY" 114 | }, 115 | "execution_count": 14, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "source": [ 121 | "list_index = ListIndex(nodes, storage_context=storage_context)\n", 122 | "vector_index = VectorStoreIndex(nodes, storage_context=storage_context)" 123 | ], 124 | "metadata": { 125 | "id": "mCMs4gb9rHe3" 126 | }, 127 | "execution_count": 15, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "source": [ 133 | "list_query_engine = list_index.as_query_engine(\n", 134 | " response_mode='tree_summarize',\n", 135 | " use_async=True,\n", 136 | ")\n", 137 | "vector_query_engine = vector_index.as_query_engine()" 138 | ], 139 | "metadata": { 140 | "id": "DslYNvuCrJq1" 141 | }, 142 | "execution_count": 16, 143 | "outputs": [] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "source": [ 148 | "from llama_index.tools.query_engine import QueryEngineTool\n", 149 | "\n", 150 | "\n", 151 | "list_tool = QueryEngineTool.from_defaults(\n", 152 | " query_engine=list_query_engine,\n", 153 | " description='Useful for summarization questions related to Paul Graham essay on What I Worked On.',\n", 154 | ")\n", 155 | "\n", 156 | "vector_tool = QueryEngineTool.from_defaults(\n", 157 | " query_engine=vector_query_engine,\n", 158 | " description='Useful for retrieving specific context 
from Paul Graham essay on What I Worked On.',\n", 159 | "\n", 160 | ")" 161 | ], 162 | "metadata": { 163 | "id": "Pgn29m0qrSel" 164 | }, 165 | "execution_count": 17, 166 | "outputs": [] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "source": [ 171 | "from llama_index.query_engine.router_query_engine import RouterQueryEngine\n", 172 | "from llama_index.selectors.llm_selectors import LLMSingleSelector\n", 173 | "\n", 174 | "\n", 175 | "query_engine = RouterQueryEngine(\n", 176 | " selector=LLMSingleSelector.from_defaults(),\n", 177 | " query_engine_tools=[\n", 178 | " list_tool,\n", 179 | " vector_tool,\n", 180 | " ]\n", 181 | ")" 182 | ], 183 | "metadata": { 184 | "id": "ms5ovM14rXS5" 185 | }, 186 | "execution_count": 18, 187 | "outputs": [] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "source": [ 192 | "print(query_engine.query('What is the summary of the document?').response)" 193 | ], 194 | "metadata": { 195 | "colab": { 196 | "base_uri": "https://localhost:8080/" 197 | }, 198 | "id": "AX4ldlPiraEA", 199 | "outputId": "e88c14fa-60fa-438f-83ac-02890722a8e9" 200 | }, 201 | "execution_count": 23, 202 | "outputs": [ 203 | { 204 | "output_type": "stream", 205 | "name": "stdout", 206 | "text": [ 207 | "\n", 208 | "In this speech, President Biden calls on Congress to pass legislation to fight inflation, lower the cost of prescription drugs, cut energy costs, end the shutdown of schools and businesses, continue vaccinating the world, and protect communities by investing in crime prevention and community police officers. He also calls for universal background checks and the repeal of the liability shield for gun manufacturers. 
Finally, he calls for the passage of the Freedom to Vote Act, the John Lewis Voting Rights Act, and the Disclose Act to protect the right to vote.\n" 209 | ] 210 | } 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "source": [ 216 | "print(query_engine.query('What is NaTO?').response)" 217 | ], 218 | "metadata": { 219 | "colab": { 220 | "base_uri": "https://localhost:8080/" 221 | }, 222 | "id": "uKsa35JUrcUr", 223 | "outputId": "fc4a817c-6428-4f7c-ed1f-b352d85e74f9" 224 | }, 225 | "execution_count": 22, 226 | "outputs": [ 227 | { 228 | "output_type": "stream", 229 | "name": "stdout", 230 | "text": [ 231 | "\n", 232 | "NATO is the North Atlantic Treaty Organization, an intergovernmental military alliance between 30 North American and European countries. It was created after World War II to secure peace and stability in Europe.\n" 233 | ] 234 | } 235 | ] 236 | } 237 | ] 238 | } -------------------------------------------------------------------------------- /Hybrid_Search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMUQHRTuOH5w9vuDJMk/ZyW", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "### Custom Retriever with Hybrid Search\n", 33 | "\n", 34 | "Keyword based search was the initial form of search used in information retrieval systems. 
Then recently we have Vector db based search which works based on semantic similarity.\n", 35 | "\n", 36 | "It is not always necessary that a Vector db backed search performs better than a keyword based search on a particular query. It can be vice-versa.\n", 37 | "\n", 38 | "Thus to overcome this, we can use Hybrid search which results in best of both worlds. Let's discuss how we can achieve this with the help of a Custom Retriever in LlamaIndex" 39 | ], 40 | "metadata": { 41 | "id": "Gb-Wn7kz5q8_" 42 | } 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "id": "BsTd1h5d5cdJ" 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "!pip install llama-index" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "source": [ 58 | "import logging\n", 59 | "import sys\n", 60 | "\n", 61 | "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", 62 | "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", 63 | "\n", 64 | "from llama_index import (\n", 65 | " VectorStoreIndex,\n", 66 | " SimpleKeywordTableIndex,\n", 67 | " SimpleDirectoryReader,\n", 68 | " ServiceContext,\n", 69 | " StorageContext,\n", 70 | ")\n", 71 | "from IPython.display import Markdown, display" 72 | ], 73 | "metadata": { 74 | "id": "uM4iEm7S5yD5" 75 | }, 76 | "execution_count": 2, 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "source": [ 82 | "!mkdir data\n", 83 | "!wget https://github.com/jerryjliu/llama_index/blob/main/examples/paul_graham_essay/data/paul_graham_essay.txt\n", 84 | "!mv paul_graham_essay.txt data/\n", 85 | "documents = SimpleDirectoryReader(\"./data/\").load_data()" 86 | ], 87 | "metadata": { 88 | "colab": { 89 | "base_uri": "https://localhost:8080/" 90 | }, 91 | "id": "FDT5zfhp5yYd", 92 | "outputId": "622e478f-4f63-445e-b47d-2fc38cf6a67b" 93 | }, 94 | "execution_count": 4, 95 | "outputs": [ 96 | { 97 | "output_type": "stream", 98 | "name": "stdout", 99 | "text": [ 100 | "--2023-07-22 18:40:50-- 
https://github.com/jerryjliu/llama_index/blob/main/examples/paul_graham_essay/data/paul_graham_essay.txt\n", 101 | "Resolving github.com (github.com)... 192.30.255.113\n", 102 | "Connecting to github.com (github.com)|192.30.255.113|:443... connected.\n", 103 | "HTTP request sent, awaiting response... 200 OK\n", 104 | "Length: 84954 (83K) [text/plain]\n", 105 | "Saving to: ‘paul_graham_essay.txt’\n", 106 | "\n", 107 | "\rpaul_graham_essay.t 0%[ ] 0 --.-KB/s \rpaul_graham_essay.t 100%[===================>] 82.96K --.-KB/s in 0.02s \n", 108 | "\n", 109 | "2023-07-22 18:40:51 (4.76 MB/s) - ‘paul_graham_essay.txt’ saved [84954/84954]\n", 110 | "\n" 111 | ] 112 | } 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "source": [ 118 | "service_context = ServiceContext.from_defaults(chunk_size=1024)\n", 119 | "node_parser = service_context.node_parser\n", 120 | "nodes = node_parser.get_nodes_from_documents(documents)\n", 121 | "storage_context = StorageContext.from_defaults()\n", 122 | "storage_context.docstore.add_documents(nodes)" 123 | ], 124 | "metadata": { 125 | "id": "Lfktds3D50LQ" 126 | }, 127 | "execution_count": 5, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "source": [ 133 | "import openai\n", 134 | "openai.api_key = \"your-openai-key\"" 135 | ], 136 | "metadata": { 137 | "id": "qJBFXwjc6IMP" 138 | }, 139 | "execution_count": 7, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "source": [ 145 | "vector_index = VectorStoreIndex(nodes, storage_context=storage_context)\n", 146 | "keyword_index = SimpleKeywordTableIndex(nodes, storage_context=storage_context)" 147 | ], 148 | "metadata": { 149 | "colab": { 150 | "base_uri": "https://localhost:8080/" 151 | }, 152 | "id": "98cSPPBW55IY", 153 | "outputId": "d84a4795-0013-443e-838f-f096dddc83bd" 154 | }, 155 | "execution_count": 8, 156 | "outputs": [ 157 | { 158 | "output_type": "stream", 159 | "name": "stderr", 160 | "text": [ 161 | "[nltk_data] Downloading package 
stopwords to /root/nltk_data...\n", 162 | "[nltk_data] Unzipping corpora/stopwords.zip.\n" 163 | ] 164 | } 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "source": [ 170 | "from llama_index import QueryBundle\n", 171 | "\n", 172 | "# import NodeWithScore\n", 173 | "from llama_index.schema import NodeWithScore\n", 174 | "\n", 175 | "# Retrievers\n", 176 | "from llama_index.retrievers import (\n", 177 | " BaseRetriever,\n", 178 | " VectorIndexRetriever,\n", 179 | " KeywordTableSimpleRetriever,\n", 180 | ")\n", 181 | "\n", 182 | "from typing import List\n", 183 | "\n", 184 | "class CustomRetriever(BaseRetriever):\n", 185 | " \"\"\"Custom retriever that performs both semantic search and hybrid search.\"\"\"\n", 186 | "\n", 187 | " def __init__(\n", 188 | " self,\n", 189 | " vector_retriever: VectorIndexRetriever,\n", 190 | " keyword_retriever: KeywordTableSimpleRetriever,\n", 191 | " mode: str = \"AND\",\n", 192 | " ) -> None:\n", 193 | " \"\"\"Init params.\"\"\"\n", 194 | "\n", 195 | " self._vector_retriever = vector_retriever\n", 196 | " self._keyword_retriever = keyword_retriever\n", 197 | " if mode not in (\"AND\", \"OR\"):\n", 198 | " raise ValueError(\"Invalid mode.\")\n", 199 | " self._mode = mode\n", 200 | "\n", 201 | " def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:\n", 202 | " \"\"\"Retrieve nodes given query.\"\"\"\n", 203 | "\n", 204 | " vector_nodes = self._vector_retriever.retrieve(query_bundle)\n", 205 | " keyword_nodes = self._keyword_retriever.retrieve(query_bundle)\n", 206 | "\n", 207 | " vector_ids = {n.node.node_id for n in vector_nodes}\n", 208 | " keyword_ids = {n.node.node_id for n in keyword_nodes}\n", 209 | "\n", 210 | " combined_dict = {n.node.node_id: n for n in vector_nodes}\n", 211 | " combined_dict.update({n.node.node_id: n for n in keyword_nodes})\n", 212 | "\n", 213 | " if self._mode == \"AND\":\n", 214 | " retrieve_ids = vector_ids.intersection(keyword_ids)\n", 215 | " else:\n", 216 | " retrieve_ids = 
vector_ids.union(keyword_ids)\n", 217 | "\n", 218 | " retrieve_nodes = [combined_dict[rid] for rid in retrieve_ids]\n", 219 | " return retrieve_nodes" 220 | ], 221 | "metadata": { 222 | "id": "BSzfu6Xw576c" 223 | }, 224 | "execution_count": 9, 225 | "outputs": [] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "source": [ 230 | "from llama_index import get_response_synthesizer\n", 231 | "from llama_index.query_engine import RetrieverQueryEngine\n", 232 | "\n", 233 | "# define custom retriever\n", 234 | "vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=2)\n", 235 | "keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index)\n", 236 | "custom_retriever = CustomRetriever(vector_retriever, keyword_retriever)\n", 237 | "\n", 238 | "# define response synthesizer\n", 239 | "response_synthesizer = get_response_synthesizer()\n", 240 | "\n", 241 | "# assemble query engine\n", 242 | "custom_query_engine = RetrieverQueryEngine(\n", 243 | " retriever=custom_retriever,\n", 244 | " response_synthesizer=response_synthesizer,\n", 245 | ")\n", 246 | "\n", 247 | "# vector query engine\n", 248 | "vector_query_engine = RetrieverQueryEngine(\n", 249 | " retriever=vector_retriever,\n", 250 | " response_synthesizer=response_synthesizer,\n", 251 | ")\n", 252 | "# keyword query engine\n", 253 | "keyword_query_engine = RetrieverQueryEngine(\n", 254 | " retriever=keyword_retriever,\n", 255 | " response_synthesizer=response_synthesizer,\n", 256 | ")" 257 | ], 258 | "metadata": { 259 | "id": "Rv3mhMUO5_62" 260 | }, 261 | "execution_count": 10, 262 | "outputs": [] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "source": [ 267 | "response = custom_query_engine.query(\"What did the author do during his time at YC?\")" 268 | ], 269 | "metadata": { 270 | "id": "JwQQ9EkT6B0c" 271 | }, 272 | "execution_count": 11, 273 | "outputs": [] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "source": [ 278 | "print(response)" 279 | ], 280 | "metadata": { 281 | 
"colab": { 282 | "base_uri": "https://localhost:8080/" 283 | }, 284 | "id": "cjSrPcpa6DW0", 285 | "outputId": "7abce0ba-74ac-4a9b-c6b5-1b7b38638692" 286 | }, 287 | "execution_count": 12, 288 | "outputs": [ 289 | { 290 | "output_type": "stream", 291 | "name": "stdout", 292 | "text": [ 293 | "\n", 294 | "The author worked on YC, writing essays, developing internal software in Arc, and creating Hacker News. He also helped select and support founders, resolve disputes between cofounders, and fight with people who maltreated the startups. He worked hard, even at the parts he didn't like, and eventually handed YC over to someone else. After his mother's death, he checked out of YC and decided to pursue painting.\n" 295 | ] 296 | } 297 | ] 298 | } 299 | ] 300 | } --------------------------------------------------------------------------------