├── .gitignore ├── LICENSE ├── README.md ├── chat_app.py ├── data ├── llama-index-tutorial │ └── paul_graham_essay.txt ├── processed_df_1692826394541260.csv └── test-data │ ├── chunked_by_header.csv │ ├── mf-docs │ └── docs │ │ ├── api │ │ └── current.md │ │ └── introduction │ │ └── why-metaflow.md │ └── ob │ ├── blog │ ├── llm-tuning-metaflow.md │ └── metaflow-fast-data.md │ └── docs │ └── data-science │ ├── compute │ └── use_multiple_cpu_cores.md │ └── orchestration │ └── dags_for_data_science.md ├── environment.yaml ├── flows ├── config │ └── repo_params.py ├── data_table_processor.py ├── lancedb_index.py ├── markdown_chunker.py ├── pinecone_index.py ├── test_simple_embedding.py └── test_simple_query.py ├── images └── cover.png ├── notebooks ├── analyze_chunks.ipynb ├── llama_index_exploration.ipynb └── markdown_repo_explorer.ipynb ├── rag_tools ├── __init__.py ├── databases │ ├── __init__.py │ └── vector_database.py ├── embedders │ ├── __init__.py │ └── embedder.py ├── filetypes │ ├── __init__.py │ └── markdown.py ├── llms │ ├── __init__.py │ └── llms_search.py └── repo │ ├── __init__.py │ └── ops.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | rag_tools/filetypes/__pycache__ 2 | rag_tools/config/__pycache__ 3 | rag_tools/repo/__pycache__ 4 | rag_tools/__pycache__ 5 | rag_tools/databases/__pycache__/ 6 | rag_tools/embedders/__pycache__/ 7 | rag_tools/llms/__pycache__/ 8 | rag_tools.egg-info 9 | notebooks/.ipynb_checkpoints 10 | .metaflow 11 | rag_demo.egg-info/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Read this blog post for context: [Retrieval-Augmented Generation: How to Use Your Data to Guide LLMs](https://outerbounds.com/blog/retrieval-augmented-generation/) 2 | 3 | 4 | 5 | 6 | 7 | 8 | # 📦 Setup 9 | 10 | ## With Conda, Mamba, or Micromamba 11 | ``` 12 | conda env create -f environment.yaml 13 | ``` 14 | 15 | ## Without Conda 16 | ``` 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | # 🔀🔁 Data collection workflows 21 | 22 | ## Chunk Markdown files in GitHub repositories 23 | 24 | Run the repository extraction and Markdown chunking flow. 25 | Edit `/flows/config/repo_params.py` to specify new repository configurations; be mindful of repository licensing! 26 | ``` 27 | python flows/markdown_chunker.py run 28 | ``` 29 | 30 | ## Post-process the results 31 | 32 | Post-process the results of the `MarkdownChunker` flow. 33 | ``` 34 | python flows/data_table_processor.py run 35 | ``` 36 | 37 | ## Analyze what just happened 38 | Let's inspect the results of these workflows in a notebook. 39 | 40 | > If you are in a Conda environment, you need to install the kernel like so before opening the notebooks: 41 | > ``` 42 | > pip install ipykernel 43 | > python -m ipykernel install --user --name rag-demo --display-name "RAG demo" 44 | > ``` 45 | 46 | Open `notebooks/analyze_chunks.ipynb` to use the Metaflow Client API to explore the results of the flow runs: 47 | ``` 48 | jupyter notebook notebooks/analyze_chunks.ipynb 49 | ``` 50 | 51 | # 🦙🧠 RAG time 52 | 53 | ## LlamaIndex 54 | Now that we have chunked a bunch of source objects, let's index them and use them to augment an LLM's context window. 55 | 56 | You can either use a Llama 2 model or the OpenAI APIs. 57 | Letting LlamaIndex use Llama 2 runs locally, so it doesn't require an API key, but it takes a lot longer. 58 | Using the OpenAI APIs is faster and inexpensive. You can create/find your API key [here](https://platform.openai.com/account/api-keys). 59 | 60 | Open `notebooks/llama_index_exploration.ipynb`: 61 | ``` 62 | jupyter notebook notebooks/llama_index_exploration.ipynb 63 | ``` 64 | 65 | ### ▶️ Run a demo in Streamlit 66 | 67 | #### 🔑 Get an OpenAI API Key 68 | Go [here](https://platform.openai.com/account/api-keys), copy your key value, and set the following environment variable: 69 | ``` 70 | export OPENAI_API_KEY= 71 | ``` 72 | 73 | #### 🤖 Run the app 74 | ``` 75 | streamlit run chat_app.py 76 | ``` 77 | 78 | ## Setup for indexing workflows 79 | There are two indexing workflows in the `/flows` folder: one indexes into Pinecone as the vector database, and the other uses the open-source LanceDB.
80 | 81 | ### 🔑 Get a Pinecone API Key 82 | Go [here](https://app.pinecone.io/), create a Pinecone account if you have to, copy your API key, and set the following environment variable: 83 | ``` 84 | export PINECONE_API_KEY= 85 | ``` 86 | 87 | Set the following environment variable too: 88 | ``` 89 | export GCP_ENVIRONMENT=us-central1-gcp 90 | ``` 91 | 92 | ### ▶️ Run the flows 93 | ``` 94 | python flows/pinecone_index.py run 95 | python flows/lancedb_index.py run 96 | ``` -------------------------------------------------------------------------------- /chat_app.py: -------------------------------------------------------------------------------- 1 | from llama_index import Document, VectorStoreIndex, set_global_service_context, ServiceContext 2 | from llama_index.retrievers import VectorIndexRetriever 3 | from llama_index.response_synthesizers import get_response_synthesizer 4 | from llama_index.llms import OpenAI 5 | from dataclasses import dataclass 6 | from metaflow import Flow 7 | import streamlit as st 8 | from typing import List 9 | import time 10 | import os 11 | 12 | st.set_page_config(layout="centered") 13 | st.title("Metaflow Chat Bot") 14 | st.markdown("This is a bare bones LLM-powered chat bot that uses the results of Metaflow workflows to answer questions about Metaflow.") 15 | 16 | subquery_prompt = """ 17 | Answer this question only if there is relevant context below: {} 18 | If there is nothing in the context say: "Could not find relevant context." 19 | Here is the retrieved context: {} 20 | """ 21 | 22 | # model = st.text_input('OpenAI model', 'gpt-3.5-turbo') 23 | # temp = st.slider(label='Temperature', min_value=0.0, max_value=1.0, step=0.01, value=0.0) 24 | # chat_mode = st.text_input('LlamaIndex chat engine mode', 'react') 25 | # K = st.number_input('K results to return', min_value=1, max_value=5, value=2, step=1) 26 | model = 'gpt-3.5-turbo' 27 | temp = 0.0 28 | chat_mode = 'react' 29 | K = 2 30 | 31 | llm = OpenAI(model=model, temperature=temp, max_tokens=2048) 32 | service_context = ServiceContext.from_defaults(llm=llm) 33 | set_global_service_context(service_context) 34 | 35 | # find latest Metaflow run that saved processed df 36 | run = None 37 | for _run in Flow('DataTableProcessor'): 38 | if _run.data.save_processed_df: 39 | run = _run 40 | break 41 | st.write("Found processed df in run: {}".format(run.id)) 42 | 43 | @dataclass 44 | class Context: 45 | 46 | def __init__(self, response: str, source_node_ids: List[str]): 47 | self.response = response 48 | self.source_node_ids = source_node_ids 49 | 50 | def get_link_df(self, meta_df, link_col = 'doc_id'): 51 | return meta_df[meta_df[link_col].isin(self.source_node_ids)] 52 | 53 | def qa_iter( 54 | question: str, 55 | index: VectorStoreIndex, 56 | k:int = 2, 57 | response_mode:str = 'tree_summarize' 58 | ) -> Context: 59 | "Match a question against an index and returns the response." 
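    # The lines below wire up LlamaIndex's retrieval pipeline: a retriever pulls the
    # top-k most similar chunks from the vector index, a response synthesizer (the
    # chosen response_mode, e.g. 'tree_summarize') condenses them into an answer,
    # and the query engine runs both for the given question.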
60 | retriever = VectorIndexRetriever(index=index, similarity_top_k=k) 61 | response_synthesizer = get_response_synthesizer(response_mode=response_mode) 62 | query_engine = index.as_query_engine(response_synthesizer=response_synthesizer, retriever=retriever) 63 | query_res = query_engine.query(question) 64 | return Context( 65 | response=query_res.response, source_node_ids=list(query_res.metadata.keys()) 66 | ) 67 | 68 | def st_output_format(similar_chunk_df): 69 | md_outputs = ["#### You may find these links helpful:"] 70 | for _, chunk in similar_chunk_df.iterrows(): 71 | md_outputs.append(f"##### [{chunk.header}]({chunk.page_url})") 72 | md_outputs.append(f"{chunk.contents[:100]}...") 73 | if len(md_outputs) == 1: 74 | md_outputs = [] 75 | return md_outputs 76 | 77 | def generative_search_engine_iter(question, index, meta_df, k=2, meta_df_id_col='doc_id'): 78 | "Assumes index and df are defined in the global scope" 79 | context = qa_iter(question, index, k=k) 80 | similar_chunk_df = meta_df[meta_df[meta_df_id_col].isin(context.source_node_ids)] 81 | return context.response, st_output_format(similar_chunk_df) 82 | 83 | # use the processed df to build the index 84 | def get_documents_from_content_section_df(df): 85 | ids = []; documents = [] 86 | for i, text in enumerate(df.contents): 87 | doc = Document(text=text, id_=i) 88 | documents.append(doc) 89 | ids.append(doc.id_) 90 | return documents, ids 91 | 92 | @st.cache_resource 93 | def setup_index(): 94 | df = run.data.processed_df 95 | documents, ids = get_documents_from_content_section_df(df) 96 | df['doc_id'] = ids 97 | index = VectorStoreIndex(documents) 98 | return index, df 99 | 100 | index, df = setup_index() 101 | chat_engine = index.as_chat_engine(chat_mode=chat_mode, verbose=True, streaming=True) 102 | 103 | # Initialize chat history 104 | st.markdown("# Chat history") 105 | if "messages" not in st.session_state: 106 | st.session_state.messages = [] 107 | 108 | # Display chat messages from history on app rerun 109 | for message in st.session_state.messages: 110 | with st.chat_message(message["role"]): 111 | st.markdown(message["content"]) 112 | 113 | # React to user input 114 | if user_prompt := st.chat_input("Hey Metaflower 🌻 what's on your mind?"): 115 | 116 | # Display user message in chat message container 117 | with st.chat_message("user"): 118 | st.markdown(user_prompt) 119 | 120 | # Add user message to chat history 121 | st.session_state.messages.append({"role": "user", "content": user_prompt}) 122 | 123 | with st.spinner("Finding context..."): 124 | subquery_response, md_outputs = generative_search_engine_iter(user_prompt, index, df, K) 125 | 126 | # Display assistant response in chat message container 127 | with st.chat_message("Metaflow assistant"): 128 | 129 | message_placeholder = st.empty() 130 | 131 | streaming_response = chat_engine.stream_chat(subquery_prompt.format(user_prompt, subquery_response)) 132 | full_response = "" 133 | for text in streaming_response.response_gen: 134 | full_response += text 135 | message_placeholder.markdown(full_response + "▌") 136 | 137 | for line in md_outputs: 138 | st.markdown(line) 139 | 140 | # Add assistant response to chat history 141 | st.session_state.messages.append({"role": "Metaflow assistant", "content": full_response}) 142 | 143 | if st.button("Reset chat engine's memory"): 144 | chat_engine.reset() -------------------------------------------------------------------------------- /data/test-data/mf-docs/docs/api/current.md: 
-------------------------------------------------------------------------------- 1 | # Current - Operating a run 2 | 3 | 4 | 5 | The `current` object is used to inspect and manipulate the currently executing run. It is only available during flow execution, i.e. inside a `FlowSpec` class and functions called from its steps. You can access the object simply by importing it: `from metaflow import current`. 6 | 7 | The attributes available in `current` depend on the decorators assigned to the flow and the step where `current` is used. Attributes that are always available are listed under *Common Attributes* below. Decorator-specific attributes are listed under the decorator name. 8 | 9 | ## Common Attributes 10 | 11 | These attributes are always available in the `current` object. 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | ## Decorator-specific attributes 114 | 115 | These attributes are only available when the decorator is present. 116 | 117 | ### @project 118 | 119 | [The @project decorator](/production/coordinating-larger-metaflow-projects) exposes attributes related to the current deployment. 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | ### @card 172 | 173 | [The @card decorator](/metaflow/visualizing-results) exposes functions in `current` that allow you to customize 174 | the contents of cards using [card components](/api/cards#Card-components). For an overview of card-related APIs, see [the API reference for cards](/api/cards). 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /data/test-data/mf-docs/docs/introduction/why-metaflow.md: -------------------------------------------------------------------------------- 1 | --- 2 | pagination_prev: index 3 | --- 4 | 5 | import Caption from "/src/components/Caption"; 6 | 7 | # Why Metaflow 8 | 9 | ### 1. Modern businesses are eager to utilize data science and ML 10 | 11 | In the past, data scientists and ML engineers had to rely 12 | on a medley of point solutions and custom systems to build ML and data science 13 | applications. 14 | 15 | ![Many data science opportunities](/assets/mf-intro-01.png) 16 | 17 | --- 18 | 19 | ### 2. What is common in DS/ML applications? 20 | 21 | Applications can be built quicker and more robustly if they stand on a common, human-friendly foundation. But what should the foundation cover? 22 | 23 | ![A solid foundation for all use cases](/assets/mf-intro-02.png) 24 | 25 | --- 26 | 27 | ### 3. 
All DS/ML applications use data 28 | 29 | **Data** may come in different shapes and sizes and may be loaded from various data stores. However, no matter what data is used, accessing and processing it shouldn't be too cumbersome. 30 | 31 | ![Data](/assets/mf-intro-03.png) 32 | 33 | --- 34 | 35 | ### 4. DS/ML applications need to perform computation 36 | 37 | Some applications require a tremendous amount of compute power - think computer vision - while some make do with less. 38 | Regardless of the scale, all applications need to perform **computation** reliably. Thanks to cloud computing, 39 | data scientists and ML engineers should be able to utilize elastic compute resources without friction. 40 | 41 | ![Compute](/assets/mf-intro-04.png) 42 | 43 | --- 44 | 45 | ### 5. DS/ML applications consist of multiple interconnected parts 46 | 47 | Consider an application that loads data, transforms it, trains a bunch of models, chooses the best performing one, runs 48 | inference, and writes the results to a database. Multi-step workflows like this are the norm in ML. **A workflow 49 | orchestrator** is needed to make sure all steps get executed in order, on time. 50 | 51 | ![Orchestration](/assets/mf-intro-05.png) 52 | 53 | --- 54 | 55 | ### 6. DS/ML applications evolve over time incrementally 56 | 57 | Rarely is a real-world application built and deployed only once. Instead, a typical application is built gradually, 58 | through contributions by many people. The project needs to be tracked, organized, and **versioned**, which enables systematic and 59 | continuous improvement over time. 60 | 61 | ![Versioning](/assets/mf-intro-06.png) 62 | 63 | --- 64 | 65 | ### 7. DS/ML applications produce business value in various ways 66 | 67 | To produce real business value, DS/ML applications can't live in a walled garden. They must be integrated with the surrounding systems seamlessly: Some applications enhance data in a database, some power internal dashboards or microservices, whereas some power user-facing products. There are many such ways to **deploy** ML in production. The more valuable the application, the more carefully it needs to be operated and monitored as well. 68 | 69 | ![Deployment](/assets/mf-intro-07.png) 70 | 71 | --- 72 | 73 | ### 8. DS/ML applications should leverage the best tools available 74 | 75 | For many data scientists and ML engineers, the most rewarding part of the project is **modeling**. Using their domain 76 | knowledge and expertise, the modeler should be able to choose the best tool for the job amongst off-the-shelf libraries, such as PyTorch, XGBoost, Scikit Learn, and many others. Or, if necessary, they should be able to use a wholly custom approach. 77 | 78 | ![Modeling](/assets/mf-intro-08.png) 79 | 80 | --- 81 | 82 | ### 9. Metaflow covers the full stack of DS/ML infrastructure 83 | 84 | Metaflow was originally created at Netflix, motivated by the realization that data scientists and ML engineers need 85 | help with all these concerns: Any gaps or friction in the stack can slow down the project drastically. Thanks to a 86 | common foundation 87 | provided by Metaflow, data scientists can iterate on ideas quickly and 88 | deploy them confidently by relying on a well-defined architecture and best practices, shared by everyone in the team. 89 | 90 | ![Full-stack Metaflow](/assets/mf-intro-09.png) 91 | 92 | --- 93 | 94 | ### 10.
Metaflow takes care of the plumbing, so you can focus on the fun parts 95 | 96 | Metaflow provides a robust and user-friendly foundation for a wide spectrum of data-intensive applications, including most data science and ML use cases. Data scientists and ML engineers who know the basics of Python can build their own applications, models, and policies on top of it, while Metaflow takes care of the low-level infrastructure: data, compute, orchestration, and versioning. 97 | 98 | ![Full stack triangles](/assets/mf-intro-10.png) 99 | 100 | --- 101 | 102 | ### 11. Metaflow relies on systems that engineers know and trust 103 | 104 | Metaflow was designed at Netflix to serve the needs of business-critical ML/DS applications. It relies on proven and scalable infrastructure which works for small and large organizations alike. Metaflow integrates with all the 105 | top clouds as well as with Kubernetes and systems around them in a responsible manner. It respects the security and other 106 | policies of your company, making engineering teams happy too. 107 | 108 | ![Existing infrastructure](/assets/mf-intro-11.png) 109 | 110 | --- 111 | 112 | ### 12. Metaflow is used by hundreds of innovative companies 113 | 114 | Today, Metaflow powers thousands of ML/DS applications at innovative companies such as 115 | [Netflix](https://netflixtechblog.com/supporting-content-decision-makers-with-machine-learning-995b7b76006f), 116 | [CNN](https://medium.com/cnn-digital/accelerating-ml-within-cnn-983f6b7bd2eb), 117 | [SAP](https://blogs.sap.com/2022/04/20/train-your-model-in-sap-ai-core-using-the-metaflow-argo-plugin/), 118 | [23andMe](https://medium.com/23andme-engineering/machine-learning-eeee69d40736), 119 | [Realtor.com](https://medium.com/realtor-com-innovation-blog/improving-data-science-processes-to-speed-innovation-at-realtor-com-b6b90fa530dc), 120 | [REA](https://www.rea-group.com/about-us/news-and-insights/blog/accelerating-experimentation-with-mlops/), 121 | [Coveo](https://outerbounds.com/blog/dataops-mlops-reasonable-organizations/), 122 | [Latana](https://aws.amazon.com/blogs/startups/brand-tracking-with-bayesian-statistics-and-aws-batch/), 123 | and 124 | hundreds of others across industries. Commercial support for Metaflow is provided by [Outerbounds](https://outerbounds.com). 125 | To hear first-hand experiences from these companies and many others, [join the Metaflow Slack](http://slack.outerbounds.co). 126 | 127 | -------------------------------------------------------------------------------- /data/test-data/ob/blog/llm-tuning-metaflow.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Fine-tuning a Large Language Model using Metaflow, featuring LLaMA and LoRA" 3 | date: "2023-06-21T12:41:26" 4 | tags: ["AI", "Foundation Models", "LLM", "NLP"] 5 | keywords: ["AI", "Foundation Models", "LLM", "NLP"] 6 | categories: ["Foundation Models"] 7 | authors: [federico, obteam] 8 | description: "A workflow template built with Metaflow for fine-tuning LLMs for custom use cases." 9 | image: "/blog/cover/llm-tuning-metaflow.png" 10 | ogImage: "/blog/og/llm-tuning-metaflow.png" 11 | slug: "llm-tuning-metaflow" 12 | --- 13 | 14 | --- 15 | 16 | *This post demonstrates how you can use Metaflow to fine-tune a production-grade custom large language model (LLM) based on instruction tuning. 
We provide an open-source, fine-tuning workflow template that allows you to run models on local or cloud-based GPUs and connect the results to surrounding production systems using the complete toolchain of Metaflow.* 17 | 18 | [Federico](https://federicobianchi.io) is a post-doctoral researcher at Stanford University, working on NLP and Large Language Models. He frequently releases research projects as open-source tools that have collectively gathered thousands of [GitHub stars](https://github.com/vinid). 19 | 20 | --- 21 | 22 | Motivation 23 | ---------- 24 | Unsurprisingly, as of June 2023, building production systems that leverage custom large language models takes a lot of work and money. As usual in any novel technical field, one faces a diverse, quickly moving landscape of immature tooling, missing documentation, and vibrant but chaotic communities. Developers face complex and varying dependencies of LLMs, spanning hardware drivers and bleeding-edge software frameworks, which can take hours and days to figure out for any given use case. 25 | 26 | Some astute companies may decide to wait a few years for the field to mature. However, many companies need or want to start innovating faster, and mature tools like [Metaflow](https://metaflow.org/) can help. Metaflow helps you design robust workflows to support any data-intensive application, allowing you to amortize the fixed cost of setting up systems - such as LLM experimentation pipelines - over the project's life. Using Metaflow, teams can set up appropriate human and technical workflows for LLM development and deployment, allowing them to focus on the business's specific needs. 27 | 28 | Metaflow has been supporting serious ML use cases like this for years. The foundational needs of LLM development don’t differ much from other earlier models (besides scale!), so Metaflow is a natural fit for building systems around custom LLMs. 29 | 30 | Foundational infrastructure for LLMs 31 | ------------------------------------ 32 | If you just want to run the code and do not care about owning the supply chain or deploying the results to a production system, [Colab](https://colab.google/) and similar GPU-powered-notebook-as-a-service providers are a convenient way to access prototyping the needed infrastructure fast. Alternatively, projects like [GPT4All](https://gpt4all.io/index.html) make running LLM inference on a laptop without a GPU straightforward. 33 | 34 | However, as we discussed in [part one](/blog/custom-llm-tuning) of this series, there are cases where organizations that want to move beyond prototyping in notebooks and consider using LLMs in production need more substantial ML tooling and infrastructure: You need to consider how to connect models to surrounding systems, run training and inference and all supporting steps on a scalable compute platform, iterate and track models, code and data across versions, and have both technical and human workflows in place for continuous improvement and experiments. All this in business-critical systems with tight SLA, security, and compliance requirements. 35 | 36 | How Metaflow helps 37 | ------------------ 38 | This article shows how to use Metaflow to instruction-tune a custom LLM, accompanied by [the full stack of production-grade ML tooling](https://outerbounds.com/docs/infra-stack/) that Metaflow provides. 
We use a few lines of Metaflow code to access and monitor GPUs in CoreWeave’s cloud, making it easy to produce, track, and report results in shareable [Metaflow cards](https://docs.metaflow.org/api/cards) when running off-the-shelf open-source LLM code. 39 | 40 | If you are unfamiliar with the features of Metaflow, we encourage you to look at the [Metaflow documentation](https://docs.metaflow.org). Crucially, all the functionality Metaflow provides can be helpful when developing and deploying real-world systems powered by LLMs. 41 | 42 | For instance, Metaflow enables us to explore different model sizes and variants while quickly tuning infrastructure requirements and scheduling runs by changing single values in Python code. Switching between models helps us explore rapidly without unnecessarily wasting money on under-utilized GPUs. 43 | 44 | ![](../static/assets/huggingface-instruction-tuning.png) 45 | Top left: The lineage of the Alpaca-LoRA model used as a default in this post; Bottom: Steps in the Metaflow workflow; Top right: The output of the fine-tuning step is model checkpoints in HuggingFace 46 | 47 | As Metaflow runs any Python code, you can easily leverage state-of-the-art models and APIs from your favorite modeling framework. For instance, In the example workflow, we push checkpoints to the HuggingFace hub with a few lines of code. In our example, we followed the [HuggingFace documentation](https://huggingface.co/docs/transformers/main_classes/trainer#checkpoints) to save all the checkpoints to a local folder, using a specific pattern that is checkpoint-xxx/ where xxx is the number of steps. 48 | 49 | Using HuggingFace APIs with Metaflow makes it easy to create a model repo for each checkpoint, train on whatever hardware you need, and push the results back to the hub to use in an application and for downstream testing. Doing this and sharing incremental model checkpoints on teams can also hugely reduce costs. People need not recompute the same model training results and can instead readily pick up training from where a colleague left off. 50 | 51 | Instruction tuning with Metaflow 52 | -------------------------------- 53 | For example, let’s fine-tune a recent LLaMA variant, taking an already trained LLaMA language model and training it further on an instruction tuning dataset. The learning task is still language modeling, but the training dataset contains specific instruction-following examples. 54 | 55 | In this case, the model learns to do language modeling relevant to the instructions in the Alpaca dataset. You can imagine using any dataset, such as how the Databricks team created a custom instruction-tuning [dataset](https://huggingface.co/datasets/databricks/databricks-dolly-15k) in [Dolly v2](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm). You can find additional information about Metaflow and large language models in our [blog post on tuning Dolly](https://outerbounds.com/blog/train-dolly-metaflow/). 56 | 57 | ### Show me the code 58 | The core concepts of this workflow template are the following: 59 | 1. HuggingFace to access the upstream models and to checkpoint our fine-tuned models, 60 | 2. CoreWeave to access GPU resources, and 61 | 3. Metaflow to structure the Python code, run it on the GPUs (carefully monitoring resource utilization with a Metaflow card), and track versions of code and models. 
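The checkpoint hand-off to the HuggingFace Hub mentioned above can be sketched with the `huggingface_hub` client. This is a minimal illustration, not the template's actual `upload_to_huggingface` helper; the repo ID, folder path, and environment variables are assumptions you would adapt:

```python
import os
from huggingface_hub import HfApi

# Assumed environment variables; the workflow template wires these up for you
# (e.g. HF_ORGANIZATION, HUGGINGFACE_TOKEN).
api = HfApi(token=os.environ["HUGGINGFACE_TOKEN"])
repo_id = f"{os.environ['HF_ORGANIZATION']}/alpaca-lora-checkpoints"  # hypothetical name

# Create the model repo if it doesn't exist, then push one checkpoint folder
# produced by the HuggingFace Trainer (the checkpoint-xxx/ pattern described above).
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
api.upload_folder(repo_id=repo_id, folder_path="lora-alpaca/checkpoint-200")
```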
62 | 63 | ![](../static/assets/coreweave-instruction-tuning.png) 64 | 65 | The following code shows how to call the main fine-tuning logic from the [Alpaca LoRA codebase](https://github.com/tloen/alpaca-lora). 66 | 67 | 68 | Use PyTorch, HuggingFace and Metaflow to build a workflow for fine-tuning LLMs. 69 | 70 | 71 | ```python 72 | # Contents of the flow.py file in the repository linked above. 73 | 74 | from metaflow import FlowSpec, step, Parameter, resources, environment 75 | from mixins import HuggingFaceLora, N_GPU, visible_devices 76 | from custom_decorators import pip, gpu_profile 77 | import os 78 | 79 | class LlamaInstructionTuning(FlowSpec, HuggingFaceLora): 80 | 81 | push_checkpoints = Parameter("-push", help="push checkpoints on huggingface", default=False, type=bool) 82 | 83 | @step 84 | def start(self): 85 | self.next(self.finetune) 86 | 87 | @environment(vars={ 88 | "CUDA_VISIBLE_DEVICES": visible_devices, 89 | "WORLD_SIZE": N_GPU, 90 | "HUGGINGFACE_TOKEN": os.environ["HUGGINGFACE_TOKEN"], 91 | "HF_ORGANIZATION": os.environ["HF_ORGANIZATION"] 92 | }) 93 | @gpu_profile(interval=1) 94 | @pip(file="requirements.txt") 95 | @resources(gpu=N_GPU, cpu=16, memory=128000) # tested with A100 and A6000 GPU. 96 | @step 97 | def finetune(self): 98 | self.run() 99 | if self.push_checkpoints: 100 | self.upload_to_huggingface() 101 | self.next(self.end) 102 | 103 | @step 104 | def end(self): 105 | pass 106 | 107 | if __name__ == '__main__': 108 | LlamaInstructionTuning() 109 | ``` 110 | 111 | [Metaflow](https://metaflow.org/) provides [a scaffolding for data science workflows](https://docs.metaflow.org/metaflow/introduction), all written in Python. In this case, the flow centers around the `finetune` step, where we use the [multiple inheritance pattern](https://realpython.com/lessons/multiple-inheritance-python/) to modularize the workflow, separating the Alpaca LoRA code that makes HuggingFace API calls from the Metaflow code that organizes the workflow. 112 | 113 | Specifically, in the `HuggingFaceLora` class, we define the logic to run the open-source LLM fine-tuning code in a [distributed data-parallel](https://pytorch.org/docs/stable/notes/ddp.html?highlight=distributed+data+parallel) manner using the [torchrun](https://pytorch.org/docs/stable/elastic/run.html) API. The LLM code is run inside a Metaflow task, and a data scientist can add single lines of Python code to layer complex functionality on each task, [like requesting precise GPU and memory resources on the cloud instance that runs the job](https://docs.metaflow.org/scaling/remote-tasks/introduction) and monitoring the resources to ensure they are appropriately sized. 114 | 115 | The key idea for data science teams is that once they have access to a Metaflow deployment, a data scientist who can feasibly write Python code can [write, run, and deploy production-grade workflows in a single work session](https://docs.metaflow.org/introduction/what-is-metaflow). Metaflow [persists and tracks all artifacts automatically](https://docs.metaflow.org/metaflow/basics#artifacts), [exposes them to notebooks](https://docs.metaflow.org/metaflow/client) and other programmatic use cases, and shows them on a UI. Once everything works locally, the workflow can be [deployed to production, connecting to other workflows both upstream and downstream](https://docs.metaflow.org/production/introduction), with a single click. 
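For example, once a run of the flow above has finished, its results can be pulled into a notebook with the Metaflow Client API. A small sketch (the artifact names stored by the template may differ):

```python
from metaflow import Flow

# Grab the latest successful run of the fine-tuning flow and inspect it.
run = Flow("LlamaInstructionTuning").latest_successful_run
print(run.id, run.finished_at)

# Any attribute assigned to `self` in a step is available under run.data.
print(run.data)
```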
116 | 117 | If you are curious to test all these features in action, you can [do it easily in the Metaflow sandbox](https://outerbounds.com/sandbox). 118 | 119 | Models 120 | ------ 121 | We fine-tuned four of the recent LLaMA models on the same dataset with a fixed computing budget for each model; we used [Low-Rank Adaptation](https://arxiv.org/abs/2106.09685), making use of the recent [Alpaca LoRA repository](https://github.com/tloen/alpaca-lora). The models we fine-tuned are the 7B, 13B, 33B, and 65B parameters models, with the idea that larger models should provide better performance and answers. We saved multiple checkpoints for each model and chose the best one by validation loss after training. 122 | 123 | We chose the Alpaca dataset - built by generating instructions from GPT-3. We use a [ version](https://huggingface.co/datasets/yahma/alpaca-cleaned) of the dataset in which some errors are corrected. The dataset contains 50K instructions. You can find more details about how we used Metaflow in the practical considerations section. 124 | 125 | ### Quantitative evaluation 126 | We wanted to assess if our fine-tuned models are still performing as expected after fine-tuning. While there is still active discussion about how to evaluate instruction and chat models (see the [ChatBot Arena](https://lmsys.org/blog/2023-05-03-arena/), for example), we decided to evaluate our models against standard LLM benchmarks as a more customized evaluation approach is beyond the scope of this post. Specifically, we make use of the [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness) package developed by EleutherAI to evaluate our fine-tuned models on these different tasks in a zero-shot manner. We chose three question-answering benchmarks: [PIQA](https://huggingface.co/datasets/piqa) and [BoolQ](https://huggingface.co/datasets/boolq), and [OpenBookQA](https://huggingface.co/datasets/openbookqa). 127 | 128 | Here is an example from PIQA (the other datasets are similar): 129 | 130 | > “To separate egg whites from the yolk using a water bottle, you should…” 131 | 132 | The dataset has two possible answers: 133 | 134 | > “Squeeze the water bottle and press it against the yolk. Release, which creates suction and lifts the yolk.” 135 | 136 | > “Place the water bottle and press it against the yolk. Keep pushing, which creates suction and lifts the yolk.” 137 | 138 | Given that this is a multiple choice question task, [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md#multiple-choice-format) runs the evaluation by comparing the likelihood of the label words. 139 | 140 | So, how well do these models work? Let’s look at some results (the evaluation metric here is accuracy). 141 | 142 |
143 |
144 | 145 | | Model | PIQA | BOOLQ | OPENBOOKQA | 146 | | ----- | ---- | ----- | ---------- | 147 | | LLAMA 7B | 0.7916 | 0.7511 | 0.4440 | 148 | | ALPACA 7B | 0.8030 | 0.7835 | 0.4420 | 149 | | LLAMA 13B | 0.8003 | 0.7798 | 0.4480 | 150 | | ALPACA 13B | 0.8030 | 0.8150 | 0.4620 | 151 | 152 |
153 |
154 | 155 | These results seem coherent with what was expected based on various leaderboards (see [LLaMA results](https://paperswithcode.com/paper/llama-open-and-efficient-foundation-language-1), for example) and papers. There is not a notably high increase in performance, as we expected. 13B parameters models do slightly better than their 7B counterparts. We do not expect a substantial increase in performance with respect to the original LLaMA models; instead, we would like to maintain similar performances on these benchmarks. Instruction tuning should give us models that follow instructions better and do not perform worse on general benchmarks. Our evaluation shows we have nicely performing models on the latter, but do they follow instructions? 156 | 157 | ### Qualitative evaluation 158 | When working with LLMs, being mindful that the research field is new and changing rapidly is essential. Therefore, when considering using this technology in a production environment, we need to evaluate qualitatively and measure performance against quantitative benchmarks, which may need significant iteration and revision for assessing performance in a way that makes sense in any given production context. 159 | 160 | Even with the small training dataset we used and only three epochs of training, our models already see exciting results. The bigger the model is, the better (and more valuable) its generations become. 161 | 162 |
163 | 164 | | Instruction | 7B | 30B | 165 | | ----------- | -- | --- | 166 | | **Write a poem about capybaras** | Capybaras, the largest rodent in the world,



With their shaggy coats and bushy tails,



They roam the grasslands of South America,



In search of food and water to quench their thirst.



Capybaras are herbivores, and they love to graze,



Eating grasses, fruits, and vegetables, too,



They are social creatures, living in herds,



And they're known for their playful and friendly deeds. | A Capybara's Life



A capybara's life is filled with joy,

As they frolic in the grass and play in the sun.



They're the largest rodents in the world,



With bodies built for swimming and for fun.



They're social creatures, always in a group,



Enjoying each other's company, without a flop.



They love to eat grass and aquatic plants,



A diet that keeps them healthy and well-fed. | 167 | | **Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering.** (From Flan) | No, it is not possible for Geoffrey Hinton to have a conversation with George Washington. This is because George Washington is a historical figure who lived from 1732 to 1799, while Geoffrey Hinton is a living person who was born in 1957. Time travel is not possible, so it is not possible for two people from different eras to have a conversation with each other. | No, it is not possible for Geoffrey Hinton to have a conversation with George Washington. Geoffrey Hinton is a living person who was born in 1947, while George Washington died in 1799. It is not possible for a living person to have a conversation with someone who has been dead for more than 200 years. | 168 | | **Classify this document in one of these categories, "sport", "politics", or "technology".**



**“The LLaMA models are a family of Natural Language Processing LLMs that can be applied to many different tasks.”** | technology | technology | 169 | | **Do aliens invest in stocks?** | No, aliens do not invest in stocks. Stock investing is a human activity that involves buying and selling shares of publicly traded companies in order to profit from changes in their share prices. Aliens do not have access to the same financial markets and systems that humans do, so they would not be able to participate in stock investing.



However, it is possible that aliens could invest in other types of financial instruments, such as bonds, mutual funds, or real estate, if they had access to the appropriate financial markets and systems. | I'm sorry, but as an AI language model, I don't have access to any information about aliens or their investment habits. It's possible that some alien civilizations may invest in stocks, but it's also possible that they may have different investment strategies or may not invest in stocks at all. It's difficult for me to give a definitive answer to this question without more information or data. It's possible that some alien civilizations may invest in stocks, but it's also possible that they may have different investment strategies. | 170 | | **To separate egg whites from the yolk using a water bottle, you should…** (from PIQA) | To separate egg whites from the yolk using a water bottle, you should follow these steps:



1. Crack the egg into a small bowl and separate the yolk from the egg white.

2. Fill the water bottle with cold water.

3. Place the egg white into the water bottle.

4. Shake the water bottle vigorously to separate the egg white from the yolk.

5. Remove the egg white from the water bottle and discard the yolk.

6. Repeat the process for the remaining eggs. | To separate egg whites from the yolk using a water bottle, you should follow these steps:



1. Start by cracking an egg into a small bowl or cup.

2. Hold the water bottle upside down over the bowl or cup.

3. Slowly squeeze the water bottle, allowing the water to flow into the bowl or cup.

4. As the water flows into the bowl or cup, it will push the egg yolk towards the bottom of the bowl or cup. | 171 | | **Best way to pierce ears** (from PIQA)| The best way to pierce ears is to use a sterilized, hygienic piercing gun or needle. It is important to follow the instructions provided by the piercing professional and to clean the area thoroughly before and after the piercing. It is also important to use a high-quality, non-allergenic jewelry and to take proper care of the piercing by cleaning it regularly and avoiding contact with water while the piercing is healing. It is recommended to wait at least 6-8 weeks for the piercing to heal completely before changing the jewelry. | The safest and most effective way to pierce your ears is to have it done by a professional piercer at a reputable piercing studio. A professional piercer will use sterilized equipment and follow strict hygienic procedures to minimize the risk of infection or other complications. They will also be able to advise you on the best placement and type of jewelry for your piercing, as well as provide aftercare instructions to ensure proper healing. | 172 | | **What's the best way to drink water if you are juggling three balls?** | If you are juggling three balls, the best way to drink water is to use a straw. This will allow you to keep your hands free to continue juggling, while also ensuring that you don't spill any water on yourself or your juggling equipment. Additionally, it's important to drink small sips of water, rather than gulping down large amounts at once. This will help you to stay hydrated without interfering with your juggling routine. | If you are juggling three balls, the best way to drink water would be to take a break from juggling, set the balls down on a safe surface, and take a sip of water from a cup or bottle. Juggling and drinking at the same time can be difficult and may result in spilling water or dropping the balls. It's important to stay hydrated while juggling, but it's best to do so in a way that doesn't interfere with your juggling performance. | 173 | 174 |
175 | 176 | These results show multiple things: first off, the models are good at replying to questions and instructions. They all come up with coherent text that appears correct. The 7B model is more prone to generating incorrect stuff, such as getting the wrong year of birth of someone or suggesting something that might be unsafe. We have color coded some replies to highlight possible good (green), wrong (red), or unclear (orange) answers. 177 | 178 | Summary 179 | ------- 180 | You can use the above Metaflow workflow as a generic template for fine-tuning models available on HuggingFace. Besides the Alpaca instruction dataset we used to fine-tune various sizes of LLMs, you can apply the same approach to fine-tune models with your instructions to optimize models for your use cases. We are happy to help you get started with this template and Metaflow in general. Join us and thousands of other ML engineers, AI enthusiasts, and data scientists on [Metaflow Community Slack](http://slack.outerbounds.co/)! 181 | 182 | Acknowledgments 183 | --------------- 184 | We would like to thank Mert Yuksekgonul, Mirac Suzgun, Patrick John Chia, Silvia Terragni, Giuseppe Attanasio, and Jacopo Tagliabue for feedback and suggestions on a previous version of this blog post. 185 | 186 | -------------------------------------------------------------------------------- /data/test-data/ob/blog/metaflow-fast-data.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Fast Data: Loading Tables From S3 At Lightning Speed" 3 | date: "2023-05-05T20:13:16" 4 | tags: ["data", "MLOps"] 5 | keywords: ["data", "MLOps"] 6 | categories: ["Data", "Best Practices"] 7 | authors: [obteam] 8 | description: "Load and process tabular data at lightning speed using Apache Arrow and Metaflow." 9 | slug: "metaflow-fast-data" 10 | image: "/blog/cover/metaflow-fast-data.png" 11 | ogImage: "/blog/og/metaflow-fast-data.png" 12 | --- 13 | 14 | --- 15 | 16 | *This article answers a common question: How can I load tabular data quickly from S3 into a Metaflow workflow and process it efficiently? Thanks to Metaflow’s straightforward support for vertical scalability, its high-throughput S3 client, and high-performance tooling powered by Apache Arrow, you can handle surprisingly large amounts of data, surprisingly fast.* 17 | 18 | *You can execute all benchmarks and examples with [the code in this repository](https://github.com/outerbounds/fast-data-blog/).* 19 | 20 | --- 21 | 22 | Ever since the early days of Metaflow at Netflix, we have been asking a fundamental question from data scientists and other users of Metaflow: What is the greatest source of friction in your day to day work? Maybe unsurprisingly, the majority of answers relate to data: Finding the right data, accessing data, transforming data, and utilizing data in various ways. 23 | 24 | In contrast to traditional software, which has a limited surface area with the outside world, data exposes ML/AI-powered applications to constantly changing real-world entropy and complexity. It would be unreasonable to expect that we can address the diverse use cases with a single nifty solution. Needs differ greatly, say, between computer vision, NLP, large tabular data, and small datasets. 25 | 26 | We can support a wide variety of use cases by working well with a broad set of open-source data tools and efficient infrastructure. 
This approach is illustrated by our view of [the full stack of ML infrastructure](https://outerbounds.com/docs/infra-stack/) which acknowledges the need of providing plenty of support for data: 27 | 28 | ![](_metaflow_fast_data/fast-data-stack.png) 29 | 30 | Metaflow doesn’t provide magical new abstractions or paradigms for processing data, but it provides foundational, human-friendly infrastructure that works smoothly with the tools of your choice. Data scientists certainly care about 31 | data but data tooling should just work without hassle. Often simpler is better. 32 | 33 | This article focuses on a common use case of data: tabular, structured datasets. This use case is particularly interesting, as tooling for it has evolved rapidly over the past few years. 34 | 35 | 36 | A fast path for tabular data 37 | ---------------------------- 38 | 39 | All modern data warehousing solutions either store data natively as Parquet-encoded files (like various versions of [the cloud-based data lake](https://towardsdatascience.com/an-aws-data-lake-with-s3-explained-c67c5f161db3) e.g. powered by [AWS Athena](https://aws.amazon.com/athena/) or [Spark](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark.html)), or they allow exporting/unloading data as Parquet files (e.g. [Snowflake](https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet) or [Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html)). 40 | 41 | Conveniently, these systems allow you to create materialized views which can be processed outside the warehouse. You can use SQL and [the Create-Table-As-Select (CTAS) pattern](https://docs.aws.amazon.com/athena/latest/ug/ctas.html) to query even petabytes of data, extracting the results for downstream processing in Metaflow. 42 | 43 | Once you have Parquet files in S3, you can hit the fast path: 44 | 45 | ![](_metaflow_fast_data/arrow-diagram.png) 46 | 47 | 1. You can load data from S3 directly to memory *very quickly*, at tens of gigabits per second or more, using [Metaflow’s optimized S3 client, `metaflow.S3`](https://docs.metaflow.org/scaling/data). 48 | 49 | 2. Once in memory, Parquet data can be decoded efficiently using [Apache Arrow](https://arrow.apache.org/). 50 | 51 | 3. The in-memory tables produced by Arrow are interoperable with various modern data tools, so you can use the data in various ways without making additional copies, which speeds up processing and avoids unnecessary memory overhead. 52 | 53 | This pattern is becoming increasingly popular, thanks to the maturation of Apache Arrow itself and versatile tools around it. The excitement is further motivated by the realization that a single medium-size EC2 instance can handle an amount of data that used to require a Hadoop or Spark cluster just a decade ago - this time with much fewer operational headaches, and at a much lower cost. 54 | 55 | The pattern is still new, so not everyone is fully aware of how well and fast it works - under suitable conditions. To showcase this, we first focus on the left side of the diagram, addressing a common myth that loading data from S3 is slow. After this, we focus on the right side, demonstrating how to use the data with various high-performance tools efficiently. 56 | 57 | S3 is fast (when used correctly) 58 | -------------------------------- 59 | 60 | Occasionally, we are asked how to cache data from S3 on local disk, instance volume, or a distributed file system like EFS to make loading and processing data faster. 
Counterintuitively, loading data from S3 can be much faster than loading it from local disk, so there's no need to add extra layers of complexity.
61 | 
62 | The question likely stems from first-hand experiences of S3 being slow, which can easily happen if you don’t have a setup that leverages its strengths. This benchmark illustrates the point:
63 | 
64 | ![](_metaflow_fast_data/load_s3_ec2_chart.png)
65 | 
66 | The bars refer to a Metaflow task that uses `metaflow.S3` to download the dataset, running on EC2 with AWS Batch [using the @batch decorator](https://docs.metaflow.org/scaling/remote-tasks/aws-batch), comparing four different configurations:
67 | 
68 | - **Small (disk)**: 8GB RAM, 2 CPU cores, a c4.2xlarge instance with EBS.
69 | - **Medium (disk)**: 8GB RAM, 8 CPU cores, a c4.8xlarge instance with EBS.
70 | - **Medium (tmpfs)**: 30GB RAM, 8 CPU cores, a c4.8xlarge utilizing Metaflow’s new `tmpfs` feature.
71 | - **Large (tmpfs)**: 200GB RAM, 32 CPU cores, an m5n.24xlarge utilizing Metaflow’s new `tmpfs` feature.
72 | 
73 | There is a 35-fold speed difference between the slowest and the fastest configuration! Understandably, if your tasks ran under the first configuration, you would conclude that S3 is slow.
74 | 
75 | ### Maximizing S3 performance
76 | 
77 | A key observation is that the download throughput of S3 is a combination of multiple factors. To achieve the maximum throughput, pay attention to the following dimensions:
78 | 
79 | - **Same region**: Make sure the EC2 instances hosting the tasks are located in the same region as the S3 bucket you are loading data from.
80 | - **File layout**: You need to download multiple files in parallel using e.g. [metaflow.S3.get_many](https://docs.metaflow.org/api/S3#S3.get_many). The files should be around 0.1-1GB each. Fortunately, it is easy to produce partitioned outputs like this with many query engines.
81 | - **Instance size**: Larger EC2 instances boast more CPU cores, higher network throughput, and more memory. When using e.g. `@batch` with Metaflow, instances autoscale based on demand, so faster processing times can lead to lower costs, despite the higher unit costs of larger instances.
82 | - **Data fits in RAM**: Crucially, loading data from S3 directly to memory is faster than loading data from S3 to an instance volume. If data doesn’t fit in memory, performance can be very bad due to slow local disk IO.
83 | 
84 | The importance of the last point is demonstrated by the two medium bars: By increasing the amount of memory available for the tasks - simply by setting `@resources(memory=32000)` - we gain an eightfold increase in S3 throughput!
85 | 
86 | The highest bar demonstrates the massive horsepower available on the largest instances: By using `metaflow.S3`, we can reach a beast-mode throughput of 32 GBit/s, which can easily beat local SSDs on laptops. In other words, loading data from S3 to an EC2 instance can be faster than loading data locally on a laptop, or loading it from an EBS volume, an ephemeral instance disk, or EFS.
87 | 
88 | New feature: tmpfs support for Batch and Kubernetes
89 | ---------------------------------------------------
90 | 
91 | We have been using the above fast data pattern successfully for years to power Metaflow workflows handling terabytes of data. However, thus far, there has been a small gotcha.
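
To make the discussion concrete, here is a rough sketch of the pattern as we have used it - an illustrative example rather than the exact benchmark code, with a made-up bucket and prefix:

```python
from metaflow import FlowSpec, step, resources, S3

class FastDataFlow(FlowSpec):

    @resources(memory=32000)  # enough memory to hold the downloaded files
    @step
    def start(self):
        # Download partitioned Parquet files in parallel with Metaflow's S3 client.
        # "s3://my-bucket/ctas-output/" is a hypothetical prefix produced by a CTAS query.
        with S3(s3root="s3://my-bucket/ctas-output/") as s3:
            objs = s3.get_all()  # or s3.get_many([...]) for a known list of keys
            # each obj.path points to a locally backed file; decode the files here,
            # while the context manager keeps them around
            total_bytes = sum(obj.size for obj in objs)
        print("downloaded %.1f GB" % (total_bytes / 1e9))
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == "__main__":
    FastDataFlow()
```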
92 | 93 | While allocating enough memory through `@resources` should guarantee that data can be downloaded quickly, we still rely on the local filesystem to back the files nominally, although the files never hit the disk thanks to caching. Sadly, it is not possible to allocate disk space for tasks on the fly while using AWS Batch. Doing it requires a bit more tedious change in [the instance’s launch template](https://docs.aws.amazon.com/batch/latest/userguide/launch-templates.html). 94 | 95 | To address this issue, recently we implemented support for memory-based `tmpfs` filesystem on Batch and Kubernetes in Metaflow. You can use the feature to create an ephemeral filesystem backed by memory on the fly, without having to change anything on the infrastructure side. 96 | 97 | We made sure that the `metaflow.S3` client is aware of the `tmpfs` volume, so it will automatically use it to speed up downloads when you enable it. To benefit from the new feature, simply add 98 | 99 | ```python 100 | @batch(use_tmpfs=True) 101 | ``` 102 | 103 | for your AWS Batch workloads or 104 | 105 | ```python 106 | @kubernetes(use_tmpfs=True) 107 | ``` 108 | for Kubernetes. There are a few additional features related to `tmpfs` which you can [read more about in the Metaflow documentation](https://docs.metaflow.org/scaling/data#using-metaflows3-for-in-memory-processing). 109 | 110 | 111 | From cloud to table 112 | ------------------- 113 | 114 | As shown above, we can load bytes to memory very quickly. To actually use the data, we must also decode the Parquet files to [an in-memory `pyarrow.Table` object](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html). When taking the decoding time into account, the differences are even more striking: 115 | 116 | ![](_metaflow_fast_data/decode_parquet_chart.png) 117 | 118 | The configurations are the same as in the first chart above. We added a top-of-the-line M2 Macbook Pro (the purple bar) as a comparison point. The Macbook loads data from a local SSD, not from S3 as the instances do. 119 | 120 | As shown above, a powerful Macbook beats small and medium-size instances when the instances are using a local disk, but even a medium-size instance beats the Macbook when using in-memory processing. In this case, a large instance can load a table about 8x faster than a Macbook. 121 | 122 | This result is a combination of multiple factors: The larger instances have more CPU cores, more memory, and more IO bandwidth available than a Macbook. Naturally, you must utilize the resources efficiently to gain the maximum performance: Use [metaflow.S3.get_many](https://docs.metaflow.org/api/S3#S3.get_many) to maximize network throughput and multithreaded decoding to benefit from all the available CPU cores, as exemplified by [the `table_loader.py` utility module](https://github.com/outerbounds/fast-data-blog/blob/main/table_loader.py). 123 | 124 | 125 | From table to results 126 | --------------------- 127 | 128 | With [a `pyarrow.Table` object](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html) in memory, the possibilities are many! We highlight three common use cases below. 129 | 130 | Crucially, all the libraries listed below support zero-copy instantiation from a `pyarrow.Table`, so you don’t have to reserve a large amount of extra `@resources(memory=)` just to handle extra copies or inefficient internal representations. Ballooning memory overhead used to be a problem in the past with older Pandas, prior to Arrow. 
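
Before looking at individual libraries, here is a rough sketch of how the bytes downloaded in the previous section can be decoded into such a `pyarrow.Table` - a simplified stand-in for the `table_loader.py` utility, not a copy of it:

```python
from concurrent import futures
import pyarrow as pa
import pyarrow.parquet as pq

def load_table(local_paths, max_workers=8):
    # decode the downloaded Parquet files on multiple cores and
    # concatenate the pieces into one in-memory Arrow table
    with futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        shards = list(pool.map(pq.read_table, local_paths))
    return pa.concat_tables(shards)

# e.g. with the local paths returned by metaflow.S3.get_many / get_all:
# table = load_table([obj.path for obj in objs])
```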
131 | 132 | 133 | You can code along with the following sections using [this repository](https://github.com/outerbounds/fast-data-blog/). 134 | 135 | ### Dataframes 136 | 137 | Moving to Pandas is trivial and fast: 138 | 139 | ```python 140 | df = table.to_pandas() 141 | ``` 142 | 143 | You can add a `zero_copy_only=True` flag to make sure that you will get warned if the conversion requires extra copies of data. Since [the Pandas 2.0 release](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i) that came out just a month ago, Pandas is fully compatible with Arrow data types, so working with Arrow and Pandas should be smooth sailing. 144 | 145 | While Pandas provides a familiar API, its operations are not always the fastest. You can often enjoy a higher performance by using [Polars - a high-performance dataframe library](https://pola.rs): 146 | 147 | ```python 148 | import polars as pl 149 | df = pl.from_arrow(table) 150 | ``` 151 | 152 | ### SQL 153 | 154 | Sometimes it is more convenient to use SQL for data processing than dataframe APIs. For instance, you can follow this 155 | pattern to enable clear division of responsibilities between data engineers and data scientists: 156 | 157 | ![](_metaflow_fast_data/data-flow.png) 158 | 159 | Diagram adapted from the book, Effective Data Science Infrastructure 160 | 161 | 162 | In this scenario, data engineers are responsible for maintaining company-wide, authoritative fact tables that individual projects can rely on. Projects may create their own interpretations of data through their own lightweight ETL process e.g. for feature transformations. 163 | 164 | The project-specific ETL can run within their Metaflow workflows using [a small but surprisingly powerful in-process database, DuckDB](https://duckdb.org/). A benefit of this approach is that data scientists can own and iterate on their specific ETL very quickly, even testing it locally on their workstations. 165 | 166 | Using this pattern is not harder than using a dataframe: 167 | 168 | ```python 169 | import duckdb 170 | query = "SELECT * FROM arrow_table" 171 | relation = duckdb.arrow(arrow_table) 172 | df = relation.query('arrow_table', query).to_df() 173 | ``` 174 | 175 | To familiarize with using DuckDB and Metaflow together in a machine learning context, you can follow along with this [recommender systems tutorial](https://outerbounds.com/docs/recsys-tutorial-L2/). 176 | 177 | ### Interfacing with ML libraries 178 | 179 | Many modern ML libraries are well optimized when it comes to utilizing CPU and GPU resources. However, to benefit 180 | from high-performance training and inferencing, you need to be able to feed data to the model fast. 181 | 182 | Luckily, Arrow data is [readily convertible to Pandas and NumPy](https://arrow.apache.org/docs/python/numpy.html) which are supported by all major ML libraries. [Through NumPy, you can also convert data to various tensor objects](https://outerbounds.com/docs/python-tabular-data-structures/). 183 | 184 | To illustrate the pattern, [we included a simple ML example](https://github.com/outerbounds/fast-data-blog/blob/main/fast_data_modeling.py) that shows how to convert an Arrow table to a Pandas dataframe, and feed this object to [a LightGBM model](https://lightgbm.readthedocs.io/). The flow trains a model to predict a target variable based on 300 finance instruments. 
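
The core of that handoff takes only a few lines. The sketch below uses an illustrative column name, `target`; see `fast_data_modeling.py` in the repository for the actual flow:

```python
import lightgbm as lgb

# `table` is the pyarrow.Table loaded earlier; the Arrow-to-pandas conversion is cheap
df = table.to_pandas()
features = df.drop(columns=["target"])  # "target" is an illustrative column name
train_set = lgb.Dataset(features, label=df["target"])
model = lgb.train({"objective": "regression"}, train_set)
```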
185 | 186 | You can run the code easily by yourself and [visualize the results through a Metaflow card](https://docs.metaflow.org/metaflow/visualizing-results), [like this one](https://cards.outerbounds.dev/index.html?id=pw-1347686565%2F0e5a2797fce749c5): 187 | 188 | 189 | 190 | Summary 191 | ------- 192 | 193 | This post outlined patterns that you can use to load tabular data from S3 to your Metaflow workflows quickly and process it in various ways using Arrow-compatible libraries. 194 | 195 | In future posts, we will dive deeper into related topics around data: How to use data engineering tools like `dbt` together with Metaflow, how to handle large datasets in a horizontally scalable manner using the patterns highlighted here, and how to make sure that the pattern comply with your data governance policies. 196 | 197 | You can test patterns highlighted here in [the Metaflow sandbox](https://outerbounds.com/sandbox). If you have any 198 | feedback, questions, or other thoughts around data, [join us and thousands of other data scientists and engineers on Metaflow Slack](http://slack.outerbounds.co)! 199 | 200 | --- 201 | 202 | PS. If open-source data topics pique your interest, join [our upcoming event on The Open-Source Modern Data Stack on June 7th.](https://www.eventbrite.com/e/the-open-source-modern-data-stack-tickets-630584333717) 203 | 204 | 205 | 206 | ![](../static/assets/tabular-fs-chat-event.png) 207 | 208 | -------------------------------------------------------------------------------- /data/test-data/ob/docs/data-science/compute/use_multiple_cpu_cores.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Use Multiple CPU Cores 3 | slug: /docs/use-multiple-cpu-cores 4 | tags: [versioning] 5 | sidebar_label: 🍃  Use Multiple Cores 6 | id: use-multiple-cpu-cores 7 | pagination_next: null 8 | pagination_prev: null 9 | description: How can I use multiple CPU cores inside of Metaflow tasks? 10 | category: data science 11 | hide_table_of_contents: false 12 | --- 13 | 14 | 15 | 16 | 17 | ## Question 18 | How can I use multiple CPU cores within Metaflow tasks? 19 | 20 | ## Answer 21 | You can use multiple cores like you normally would in your Python code in a Metaflow task. 22 | For example, you can use Python's [concurrent.future](https://docs.python.org/3/library/concurrent.futures.html) module to launch multiple processes and threads. 23 | 24 | 25 | 26 | ### Write Flow with Concurrency in a Task 27 | 28 | 29 | 30 | 31 | This flow shows how to: 32 | - Call the `do_task` function four times on different cores using Python's built-in `concurrent.futures.ThreadPoolExecutor`. 33 | - Print the total time elapsed to demonstrate the tasks ran in parallel. 
34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | ```py title="use_multiple_cpu_cores.py" 42 | from metaflow import FlowSpec, step, Parameter 43 | from concurrent import futures 44 | import time 45 | 46 | def do_task(name): 47 | time.sleep(3) # a long-running task 48 | return name 49 | 50 | class MulticoreFlow(FlowSpec): 51 | 52 | #highlight-next-line 53 | num_cores = Parameter('num-cores', default=4) 54 | 55 | @step 56 | def start(self): 57 | threads = [ 58 | "thread_%s" % i 59 | for i in range(self.num_cores) 60 | ] 61 | t0 = time.time() 62 | #highlight-start 63 | with futures.ThreadPoolExecutor( 64 | max_workers = self.num_cores 65 | ) as exe: 66 | self.messages = [ 67 | msg for msg in exe.map(do_task, threads) 68 | ] 69 | #highlight-end 70 | self.time_elapsed = time.time() - t0 71 | self.next(self.end) 72 | 73 | @step 74 | def end(self): 75 | print_msg = "All tasks completed in %.3fs" 76 | print(print_msg % self.time_elapsed) 77 | [print(msg) for msg in self.messages] 78 | 79 | if __name__ == '__main__': 80 | MulticoreFlow() 81 | ``` 82 | 83 | 84 | 85 | ### Run Flow 86 | 87 | 88 | 89 | ```bash 90 | python use_multiple_cpu_cores.py run 91 | ``` 92 | 93 | 94 | 95 | ``` 96 | Workflow starting (run-id 1663278003721691): 97 | [1663278003721691/start/1 (pid 47937)] Task is starting. 98 | [1663278003721691/start/1 (pid 47937)] Task finished successfully. 99 | [1663278003721691/end/2 (pid 47941)] Task is starting. 100 | [1663278003721691/end/2 (pid 47941)] All tasks completed in 3.011s 101 | [1663278003721691/end/2 (pid 47941)] thread_0 102 | [1663278003721691/end/2 (pid 47941)] thread_1 103 | [1663278003721691/end/2 (pid 47941)] thread_2 104 | [1663278003721691/end/2 (pid 47941)] thread_3 105 | [1663278003721691/end/2 (pid 47941)] Task finished successfully. 106 | Done! 107 | ``` 108 | 109 | 110 | 111 | 112 | 113 | 114 | ## Further Reading 115 | * [Parallelization Over Multiple Cores](https://docs.metaflow.org/metaflow/scaling-out-and-up/effortless-scaling-with-kubernetes#parallelization-over-multiple-cores) 116 | * [How to Scale Model Training and Tuning to GPU](/docs/scale-model-training-and-tuning/) 117 | -------------------------------------------------------------------------------- /data/test-data/ob/docs/data-science/orchestration/dags_for_data_science.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Why Should I Care About DAGs and Workflows in Data Science? 3 | slug: /docs/dags-in-data-science 4 | tags: [dag, orchestration] 5 | sidebar_label: 🌳  Why DAGs? 6 | id: dags-in-data-science 7 | pagination_next: null 8 | pagination_prev: null 9 | description: The complexity of machine learning and data science workflows can become complicated quickly. To avoid a jungle of data pipelines and models, we need organizational principles for workflows, and directed acyclic graphs have become a standard for this 10 | category: data science 11 | hide_table_of_contents: false 12 | --- 13 | 14 | 15 | 16 | 17 | ## DAGs and Data Science Workflows 18 | 19 | The complexity of machine learning and data science workflows can become complicated very quickly. 
To avoid a jungle of data pipelines and models, we need organizational principles for our workflows, and directed acyclic graphs aka DAGs have become a standard for this:
20 | 
21 | ![](/assets/dags-in-data-science-01.png)
22 | Common data science DAG
23 | 
24 | DAGs don’t only exist to tame complexity in the abstract, but are practically useful for several reasons:
25 | 
26 | * They provide a common vocabulary of steps (the nodes) and transitions between them (the edges), which makes it easier to write and understand nontrivial applications that are structured as DAGs.
27 | * They both encourage and force us to be explicit about the order in which our workflows are executed. This is particularly useful when the order is anything more complicated than a linear flow, such as the one you would see in a notebook. Being explicit about the order of operations helps us to manage what could otherwise be a jungle of models and data pipelines.
28 | * Using DAGs, we can signal when the order of operations doesn’t matter, such as when training independent models. We can parallelize these operations automatically, which is essential for performant code.
29 | 
30 | In summary, it is helpful to view DAGs as a language, not a programming language per se, but rather a formal construct for human-to-human communication. With DAGs, we can speak about complex sequences of operations in a concise and understandable manner.
31 | 
32 | ## What exactly is a DAG?
33 | 
34 | It is worth defining explicitly what a DAG is: Directed Acyclic Graph may sound like a mouthful, but you can think of DAGs as follows:
35 | 
36 | * A graph is a set of nodes connected by edges
37 | * A graph is directed if each edge is an arrow (that is, pointing from one node to another)
38 | * Acyclic means there are no cycles
39 | 
40 | The nodes of the DAG of a data science or machine learning workflow are the steps in your workflow, such as (but not limited to) getting your data, training a model, or choosing the best model, as in the DAG above. In the case of Metaflow, a step can contain any Python code - like a cell in a notebook. The edges tell you which step(s) follow from other steps, and this is why they need to be directed. The reason we don’t want cycles may now be clear: we don’t want our pipelines to end up in infinite loops!
41 | 
42 | In an industrial setting, DAGs can be and often are far more complicated than the toy case above:
43 | 
44 | ![](/assets/dags-in-data-science-02.png)
45 | DAGs expand quickly in data science workflows
46 | 
47 | To be explicit, DAGs are important because they tame the complexity of machine learning workflows. There is an important corollary: not all data science workflows are DAGs; rather, DAGs are a particular type of workflow.
48 | 
49 | 
50 | ## Orchestrating Workflows
51 | 
52 | Once we have a DAG, we need a workflow orchestrator to perform our computation. The workflow orchestrator needs to perform a seemingly simple task: given a workflow or DAG definition, execute the steps defined by the graph in order. The orchestrator walks through the graph, sending each step to a separate compute layer that is responsible for executing the code contained in a step. For data-intensive, distributed workflows, a compute layer like Spark or Kubernetes can be used, whereas local processes suffice for prototyping.
53 | In contrast to prototyping, production workflows need to run automatically and reliably without human intervention.
Workflow orchestrators play key roles in ensuring that production workflows, such as those training models and producing predictions regularly, execute in a scalable and highly available manner. Hence it makes sense to use a battle-hardened, proven system in production such as: 54 | * [Argo](https://argoproj.github.io/), a modern orchestrator that runs natively on Kubernetes 55 | * [Airflow](https://airflow.apache.org/), a popular open-source workflow orchestrator 56 | * or a low-maintenance managed solution such as [Google Cloud Composer](https://cloud.google.com/composer) or [AWS Step Functions](https://aws.amazon.com/step-functions/?step-functions.sort-by=item.additionalFields.postDateTime&step-functions.sort-order=desc) 57 | 58 | ## Developing Workflows 59 | Earlier, data science applications were often developed as independent modules which were glued together as a DAG afterwards, sometimes even by a separate team. As a result, it was hard to ensure that the whole application worked correctly end to end. Debugging workflow failures was often a painful experience as issues couldn’t be easily reproduced and fixed locally. 60 | A better approach is to consider the whole workflow as a key deliverable of a data science project, which is developed, tested, and deployed as a functional unit. When using a local orchestrator like the one provided by Metaflow, you can rapidly prototype end-to-end workflows almost as easily as how you develop code in a notebook. After the workflow works locally, it can be deployed to a production orchestrator without any changes in the code. After all, a data science workflow is just a DAG of steps that can be orchestrated by many different systems. 61 | 62 | 63 | 64 | ## How do I? 65 | 66 | [Pass artifacts between steps in Metaflow flows](/docs/pass-artifacts-between-steps) 67 | 68 | --- 69 | 70 | [Schedule Metaflow flows on AWS Step Functions](/docs/schedule-flow-on-step-functions) 71 | 72 | --- 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: rag-demo 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python 6 | - pip 7 | - pandas 8 | - metaflow 9 | - matplotlib 10 | - seaborn 11 | - jupyter 12 | - pip: 13 | - sentence-transformers 14 | - openai 15 | - lancedb 16 | - python-frontmatter 17 | - gitpython 18 | - git+https://github.com/outerbounds/rag-demo 19 | - python-slugify 20 | - tldextract 21 | - llama-index==0.8.0 22 | - llama-cpp-python==0.1.77 23 | - sentence-transformers 24 | - openai 25 | - llama-index 26 | - lancedb 27 | - pinecone-client[grpc] 28 | - streamlit 29 | # - fashion-clip -------------------------------------------------------------------------------- /flows/config/repo_params.py: -------------------------------------------------------------------------------- 1 | SAMPLE_OSS_MARKDOWN_REPOS = [ 2 | { 3 | "deployment_url": "docs.metaflow.org", 4 | "repository_path": "https://github.com/Netflix/metaflow-docs", 5 | "repository_ref": "master", 6 | "base_search_path": "docs", 7 | "exclude_paths": ["docs/v"], 8 | "exclude_files": ["README.md", "README"], 9 | }, 10 | { 11 | "deployment_url": "https://huggingface.co/docs/accelerate", 12 | "repository_path": "https://github.com/huggingface/accelerate", 13 | "repository_ref": "main", 14 | "base_search_path": "docs", 15 | "exclude_paths": [], 16 | "exclude_files": [], 17 | } 18 | ] 
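# The entries above are consumed by the MarkdownChunker flow (via rag_tools) to decide
# which repositories to clone and which markdown files to chunk. As a rough, illustrative
# sketch, another repository could be indexed by appending a dict with the same keys
# (all values below are hypothetical):
#
# SAMPLE_OSS_MARKDOWN_REPOS.append(
#     {
#         "deployment_url": "docs.example.com",                   # where the docs are served
#         "repository_path": "https://github.com/org/docs-repo",  # git repository to clone
#         "repository_ref": "main",                               # branch or tag to check out
#         "base_search_path": "docs",                             # subdirectory to scan for markdown
#         "exclude_paths": [],                                    # path prefixes to skip
#         "exclude_files": ["README.md"],                         # file names to skip
#     }
# )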
-------------------------------------------------------------------------------- /flows/data_table_processor.py: -------------------------------------------------------------------------------- 1 | from metaflow import FlowSpec, step, Parameter, card, current, Flow, trigger_on_finish 2 | from metaflow.cards import Image, Markdown 3 | from matplotlib import pyplot as plt 4 | import seaborn as sns 5 | 6 | sns.set_style("dark") 7 | COLORS = { 8 | "purple": "#98A1E1", 9 | "light-purple": "#DADEFB", 10 | "gold": "#F0C054" 11 | } 12 | 13 | 14 | @trigger_on_finish(flow='MarkdownChunker') 15 | class DataTableProcessor(FlowSpec): 16 | 17 | save_processed_df = Parameter( 18 | "save_processed_df", 19 | help="Whether to save the processed dataframe to the run.", 20 | default=True, 21 | type=bool, 22 | ) 23 | 24 | data_dir = Parameter( 25 | "data_dir", 26 | help="The directory to save the processed dataframe to.", 27 | default="data", 28 | type=str, 29 | ) 30 | 31 | parent_flow = Parameter( 32 | "parent_flow", 33 | help="The flow id of the parent flow to process.", 34 | default='MarkdownChunker', 35 | type=str, 36 | ) 37 | 38 | n_bins = Parameter( 39 | "n_bins", 40 | help="The number of bins to use in the histogram.", 41 | default=100, 42 | type=int, 43 | ) 44 | 45 | word_count_threshold = Parameter( 46 | "word_count_threshold", 47 | help="The word count threshold to use in the histogram.", 48 | default=10, 49 | type=int, 50 | ) 51 | 52 | char_count_threshold = Parameter( 53 | "char_count_threshold", 54 | help="The char count threshold to use in processing.", 55 | default=25, 56 | type=int, 57 | ) 58 | 59 | def plot_char_word_histogram(self, char_count_threshold=0, word_count_threshold=0, _df=None, title="", ): 60 | fig, ax = plt.subplots(1, 2, figsize=(12, 4)) 61 | ax[0] = _df.char_count.plot.hist(bins=self.n_bins, color=COLORS['purple'], ax=ax[0]) 62 | ax[1] = _df.word_count.plot.hist(bins=self.n_bins, color=COLORS['light-purple'], ax=ax[1]) 63 | if char_count_threshold > 0 or word_count_threshold > 0: 64 | ax[0].set_xlabel("Filtered character count > %d" % char_count_threshold) 65 | ax[1].set_xlabel("Filtered word count > %d" % word_count_threshold) 66 | ax[0].set_xscale("log") 67 | ax[1].set_xscale("log") 68 | ax[0].set_ylabel("Frequency") 69 | ax[0].spines.right.set_visible(False) 70 | ax[0].spines.top.set_visible(False) 71 | ax[1].spines.right.set_visible(False) 72 | ax[1].spines.top.set_visible(False) 73 | fig.suptitle(title, fontsize=24) 74 | fig.tight_layout() 75 | assert fig is not None, "Figure is None, check plot_char_word_histogram." 76 | return fig 77 | 78 | def plot_tld_count(self): 79 | fig, ax = plt.subplots(1, 1, figsize=(12, 4)) 80 | self.processed_df.groupby('tld').count()['index'].sort_values(ascending=False).plot.bar( 81 | ax=ax, color=COLORS['gold'] 82 | ) 83 | fig.suptitle("Top-level domain representation in the dataset", fontsize=24) 84 | ax.set_xticklabels(ax.get_xticklabels(), rotation = 45) 85 | ax.set_xlabel("") 86 | ax.set_ylabel("Frequency") 87 | fig.tight_layout() 88 | assert fig is not None, "Figure is None, check plot_tld_count." 
89 | return fig 90 | 91 | @card 92 | @step 93 | def start(self): 94 | 95 | parent_flow = Flow(self.parent_flow) 96 | run = parent_flow.latest_run 97 | 98 | if not run.successful: 99 | print("Skipping processing of unsuccessful run {}.".format(run.id)) 100 | self.processed_df = None 101 | 102 | else: 103 | 104 | current.card.append(Markdown(f"""# Processing data table from run {run.id}""")) 105 | 106 | current.card.append(Markdown(f"""## Filtering rows""")) 107 | df = run.data.df 108 | fig = self.plot_char_word_histogram(_df = df, title="Before filtering") 109 | current.card.append(Image.from_matplotlib(fig)) 110 | 111 | # Filter out rows with less than N words. 112 | _df = df[df.word_count > self.word_count_threshold] 113 | 114 | # Filter out rows with less than M chars. 115 | _df = _df[_df.char_count > self.char_count_threshold] 116 | 117 | # Feature: Add a column for the TLD. 118 | import tldextract 119 | def get_tld(url): 120 | ext = tldextract.extract(url) 121 | if ext.subdomain != '': 122 | url = '.'.join(ext[:3]) 123 | else: 124 | url = '.'.join(ext[1:3]) 125 | return "https://" + url 126 | _df['tld'] = _df['page_url'].apply(get_tld) 127 | 128 | # Reindex and keep index in upstream dataframe. 129 | _df.reset_index(inplace=True) 130 | _df.index = range(len(_df)) 131 | 132 | fig = self.plot_char_word_histogram( 133 | word_count_threshold=self.word_count_threshold, 134 | char_count_threshold=self.char_count_threshold, 135 | _df = _df, title="After filtering" 136 | ) 137 | current.card.append(Image.from_matplotlib(fig)) 138 | 139 | ### ADD MORE FILTERS HERE. 140 | print("Filtered dataframe from shape {} to shape {}.".format( 141 | df.shape, _df.shape)) 142 | self.processed_df = _df 143 | 144 | # Plot the number of rows per TLD. 145 | fig = self.plot_tld_count() 146 | current.card.append(Image.from_matplotlib(fig)) 147 | 148 | ### ADD MORE SUMMARY STATS HERE. 149 | 150 | self.next(self.end) 151 | 152 | @step 153 | def end(self): 154 | 155 | import os 156 | 157 | print("The {} run {} has ended, with a dataframe of shape: {}".format( 158 | current.flow_name, current.run_id, 159 | self.processed_df.shape)) 160 | print( 161 | f""" 162 | You can now use the dataframe to do whatever you want. 
163 | To load it in a notebook, you can use the following code: 164 | 165 | from metaflow import Flow, namespace 166 | namespace('{current.namespace}') 167 | run = Run('{current.flow_name}/{current.run_id}') 168 | processed_df = run.data.processed_df 169 | print(processed_df.shape) 170 | """ 171 | ) 172 | 173 | if self.save_processed_df: 174 | self.processed_df_path = os.path.abspath( 175 | os.path.join(self.data_dir, 'processed_df_%s.csv' % current.run_id)) 176 | print("Saving processed dataframe to %s" % self.processed_df_path) 177 | self.processed_df.to_csv(self.processed_df_path, index=False) 178 | 179 | 180 | if __name__ == '__main__': 181 | DataTableProcessor() -------------------------------------------------------------------------------- /flows/lancedb_index.py: -------------------------------------------------------------------------------- 1 | from metaflow import FlowSpec, step, Flow, current 2 | 3 | class LanceDBVectorIndexer(FlowSpec): 4 | 5 | table_name = "test" 6 | embedding_model = "paraphrase-MiniLM-L6-v2" 7 | embedding_target_col_name = "contents" 8 | 9 | def find_processed_df(self): 10 | for run in Flow('DataTableProcessor'): 11 | if run.data.save_processed_df: 12 | print("Found processed df in run: {}".format(run.id)) 13 | return run.data.processed_df 14 | 15 | @step 16 | def start(self): 17 | self.next(self.create_index) 18 | 19 | @step 20 | def create_index(self): 21 | 22 | from rag_tools.databases.vector_database import LanceDB 23 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder 24 | import pandas as pd 25 | 26 | # fetch data and embed it 27 | self.data = self.find_processed_df() 28 | encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu") 29 | docs = self.data[self.embedding_target_col_name].tolist() 30 | self.ids = list(range(1, len(docs) + 1)) 31 | embeddings = encoder.embed(docs) 32 | self.dimension = len(embeddings[0]) 33 | 34 | # put the vectors in the index 35 | db = LanceDB() 36 | db.create_index(self.table_name, embeddings, docs, self.ids) 37 | 38 | self.next(self.end) 39 | 40 | @step 41 | def end(self): 42 | 43 | from rag_tools.databases.vector_database import LanceDB 44 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder 45 | 46 | db = LanceDB() 47 | 48 | # search the index in a test query 49 | K = 3 50 | test_prompt = "aws" 51 | encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu") 52 | self.search_vector = encoder.embed([test_prompt])[0] 53 | self.results = db.vector_search(self.table_name, self.search_vector, k=K) 54 | 55 | print(f""" 56 | Access flow results with: 57 | 58 | from metaflow import Run 59 | run = Run('{current.flow_name}/{current.run_id}') 60 | results = run.data.results 61 | 62 | Resume LanceDBVectorIndexer with: 63 | 64 | from rag_tools.databases.vector_database import LanceDB 65 | db = LanceDB() # default storage location is `../../chatbot.lance`, relative to your cwd. 66 | db.vector_search(table_name, search_vector, k=K) 67 | """) 68 | 69 | 70 | if __name__ == '__main__': 71 | LanceDBVectorIndexer() -------------------------------------------------------------------------------- /flows/markdown_chunker.py: -------------------------------------------------------------------------------- 1 | from metaflow import FlowSpec, step, current, schedule 2 | from rag_tools.filetypes.markdown import Mixin as MarkdownMixin 3 | 4 | @schedule(weekly=True) 5 | class MarkdownChunker(FlowSpec, MarkdownMixin): 6 | 7 | @step 8 | def start(self): 9 | """ 10 | Start the flow. 
11 | Try to download the content from the repository. 12 | """ 13 | 14 | # see config.py for the definition of repo_params 15 | # it is a list of dictionaries, 16 | # that tell the Markdown tools where to look for content. 17 | # see /notebooks/markdonw_repo_explorer.ipynb for more details. 18 | from config.repo_params import SAMPLE_OSS_MARKDOWN_REPOS 19 | 20 | self.repo_params = SAMPLE_OSS_MARKDOWN_REPOS 21 | self.df = self.load_df_from_repo_list() 22 | self.next(self.end) 23 | 24 | @step 25 | def end(self): 26 | print("The flow has ended, with a dataframe of shape: {}".format(self.df.shape)) 27 | print( 28 | f""" 29 | You can now use the dataframe to do whatever you want. 30 | To load it in a notebook, you can use the following code: 31 | 32 | from metaflow import Flow, namespace 33 | namespace('{current.namespace}') 34 | run = Run('{current.flow_name}/{current.run_id}') 35 | df = run.data.df 36 | print(df.shape) 37 | """) 38 | 39 | 40 | if __name__ == "__main__": 41 | MarkdownChunker() -------------------------------------------------------------------------------- /flows/pinecone_index.py: -------------------------------------------------------------------------------- 1 | from metaflow import FlowSpec, step, Flow, environment 2 | import os 3 | 4 | env_vars = { 5 | 'PINECONE_API_KEY': os.environ['PINECONE_API_KEY'], 6 | 'GCP_ENVIRONMENT': os.environ['GCP_ENVIRONMENT'] 7 | } 8 | 9 | class PineconeVectorIndexer(FlowSpec): 10 | 11 | index_name = "test" 12 | embedding_model = "paraphrase-MiniLM-L6-v2" 13 | embedding_target_col_name = "contents" 14 | 15 | def find_processed_df(self): 16 | for run in Flow('DataTableProcessor'): 17 | if run.data.save_processed_df: 18 | print("Found processed df in run: {}".format(run.id)) 19 | return run.data.processed_df 20 | 21 | @step 22 | def start(self): 23 | self.next(self.create_index) 24 | 25 | @environment(vars=env_vars) 26 | @step 27 | def create_index(self): 28 | 29 | from rag_tools.databases.vector_database import PineconeDB 30 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder 31 | import pandas as pd 32 | 33 | # fetch data and embed it 34 | self.data = self.find_processed_df() 35 | encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu") 36 | docs = self.data[self.embedding_target_col_name].tolist() 37 | self.ids = list(range(1, len(docs) + 1)) 38 | embeddings = encoder.embed(docs) 39 | self.dimension = len(embeddings[0]) 40 | 41 | # create the index 42 | db = PineconeDB() 43 | db.create_index(self.index_name, dimension=self.dimension) 44 | 45 | # put the vectors in the index 46 | db.upsert(self.index_name, embeddings, docs, self.ids) 47 | 48 | self.next(self.end) 49 | 50 | @environment(vars=env_vars) 51 | @step 52 | def end(self): 53 | 54 | from rag_tools.databases.vector_database import PineconeDB 55 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder 56 | 57 | # create_index is idempotent 58 | db = PineconeDB() 59 | db.create_index(self.index_name, dimension=self.dimension) 60 | 61 | # search the index in a test query 62 | K = 3 63 | test_prompt = "aws" 64 | encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu") 65 | self.search_vector = encoder.embed([test_prompt])[0] 66 | self.results = db.vector_search(self.index_name, self.search_vector, k=K).to_dict() 67 | 68 | for result in self.results['matches']: 69 | print("\n\nid: {} - score: {} \n\n{}\n\n".format(result['id'], result['score'], result['metadata']['text'])) 70 | 
            print("===============================================")
71 | 
72 |         print("\n\n Flow is done, check for results in the {} index at https://app.pinecone.io/.".format(self.index_name))
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     PineconeVectorIndexer()
-------------------------------------------------------------------------------- /flows/test_simple_embedding.py: --------------------------------------------------------------------------------
1 | from rag_tools.databases.vector_database import LanceDB, PineconeDB
2 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder
3 | from metaflow import Flow
4 | import pandas as pd
5 | 
6 | def find_processed_df():
7 |     for run in Flow('DataTableProcessor'):
8 |         if run.data.save_processed_df:
9 |             print("Found processed df in run: {}".format(run.id))
10 |             return run.data.processed_df
11 | 
12 | # fetch data and embed it
13 | data = find_processed_df()
14 | encoder = SentenceTransformerEmbedder("paraphrase-MiniLM-L6-v2", device="cpu")
15 | docs = data["contents"].tolist()
16 | ids = list(range(1, len(docs) + 1))
17 | embeddings = encoder.embed(docs)
18 | 
19 | DB = "lance"
20 | 
21 | if DB == "lance":
22 |     db = LanceDB()
23 |     db.create_index("test", embeddings, docs, ids)
24 | 
25 | elif DB == "pinecone":
26 |     db = PineconeDB()
27 |     db.create_index("test", dimension=len(embeddings[0]))
28 |     db.upsert("test", embeddings, docs, ids)
29 | 
30 | # search the index
31 | K = 3
32 | user_prompt = "aws"
33 | search_vector = encoder.embed([user_prompt])[0]
34 | results = db.vector_search("test", search_vector, k=K) #.to_dict()
35 | 
36 | for result in results['matches']:
37 |     print("id: {} - distance score: {} \n\n{}\n\n".format(result['id'], result['score'], result['metadata']['text']))
38 | 
39 | db.destroy_index("test")
-------------------------------------------------------------------------------- /flows/test_simple_query.py: --------------------------------------------------------------------------------
1 | import os
2 | from rag_tools.databases.vector_database import LanceDB
3 | from rag_tools.llms.llms_search import ChatGPTWrapper
4 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder
5 | 
6 | # connect to the LanceDB index built by LanceDBVectorIndexer / test_simple_embedding.py;
7 | # this assumes the default "test" table already exists at the default storage location
8 | db = LanceDB()
9 | 
10 | query = "How do I specify conda dependencies in my flow?"
11 | 
12 | # embed with sentence transformer
13 | encoder = SentenceTransformerEmbedder("paraphrase-MiniLM-L6-v2", device="cpu")
14 | search_vector = encoder.embed([query])[0]
15 | 
16 | # retrieve the most similar chunks from the vector index
17 | best_text = db.vector_search("test", search_vector, k=2)
18 | 
19 | # we build some context for the question
20 | text = "\n\n".join(best_text['text'].tolist())
21 | 
22 | # query prompt for chatgpt
23 | prompt = f"Please answer this question {query}\n\nhere's the context you should use:\n\n{text}.\n\nIf the answer is not provided in the context, answer I don't know."
24 | 25 | output = ChatGPTWrapper(os.environ['OPENAI_API_KEY']).sample(prompt) 26 | 27 | print(f"Question: {prompt}") 28 | print() 29 | print(f"Answer: {output}") -------------------------------------------------------------------------------- /images/cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/images/cover.png -------------------------------------------------------------------------------- /notebooks/llama_index_exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Set your API Key" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# %env OPENAI_API_KEY=" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Import dependencies" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 61, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# built-ins\n", 33 | "from pathlib import Path\n", 34 | "from functools import partial\n", 35 | "import os\n", 36 | "from typing import Tuple, List\n", 37 | "from dataclasses import dataclass\n", 38 | "\n", 39 | "# custom lib\n", 40 | "from rag_tools.repo.ops import ensure_repo_exists_locally, DocumentationExtractor\n", 41 | "\n", 42 | "# third party libs\n", 43 | "from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, Document, download_loader\n", 44 | "from llama_index.retrievers import VectorIndexRetriever\n", 45 | "from llama_index.response_synthesizers import get_response_synthesizer\n", 46 | "from llama_index.llms import OpenAI\n", 47 | "from metaflow import Flow\n", 48 | "import pandas as pd\n", 49 | "from IPython.display import display, Markdown" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# Define utility functions" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 11, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "@dataclass\n", 66 | "class Context:\n", 67 | " def __init__(self, response: str, source_node_ids: List[str]):\n", 68 | " self.response = response\n", 69 | " self.source_node_ids = source_node_ids\n", 70 | "\n", 71 | " def get_link_df(self, meta_df, link_col = 'doc_id'):\n", 72 | " return meta_df[meta_df[link_col].isin(self.source_node_ids)]\n", 73 | "\n", 74 | "def qa_iter(\n", 75 | " question: str, \n", 76 | " index: VectorStoreIndex, \n", 77 | " k:int = 2, \n", 78 | " response_mode:str = 'tree_summarize'\n", 79 | ") -> Context:\n", 80 | " \"Match a question against an index and returns the response.\"\n", 81 | " retriever = VectorIndexRetriever(index=index, similarity_top_k=k)\n", 82 | " response_synthesizer = get_response_synthesizer(response_mode=response_mode)\n", 83 | " query_engine = index.as_query_engine(response_synthesizer=response_synthesizer, retriever=retriever)\n", 84 | " query_res = query_engine.query(question)\n", 85 | " return Context(\n", 86 | " response=query_res.response, source_node_ids=list(query_res.metadata.keys())\n", 87 | " )\n", 88 | "\n", 89 | "dm = lambda x: display(Markdown(x))\n", 90 | "def dmqa(q, a): \n", 91 | " dm(f\"\"\"\n", 92 | "**Question:** {q}\n", 93 | "\n", 94 | "**Answer:** {a}\n", 95 | "\"\"\"\n", 96 | ")\n", 97 | "\n", 98 | "def 
nb_output_format(question, response, similar_chunk_df):\n", 99 | " dm(f\"#### {question}\")\n", 100 | " dm(f\"**Retrieved Response**\")\n", 101 | " dm(response)\n", 102 | " dm(f\"#### Sources\")\n", 103 | " for _, chunk in similar_chunk_df.iterrows():\n", 104 | " dm(f\"##### [{chunk.header}]({chunk.page_url})\")\n", 105 | " dm(f\"{chunk.contents[:100]}...\")\n", 106 | "\n", 107 | "def get_documents_from_content_section_df(df):\n", 108 | " ids = []; documents = []\n", 109 | " for i, text in enumerate(df.contents):\n", 110 | " doc = Document(text=text, id_=i)\n", 111 | " documents.append(doc)\n", 112 | " ids.append(doc.id_)\n", 113 | " return documents, ids\n", 114 | "\n", 115 | "def generative_search_engine_iter(question, index, meta_df, meta_df_id_col='doc_id'):\n", 116 | " \"Assumes index and df are defined in the global scope\"\n", 117 | " context = qa_iter(question, index)\n", 118 | " similar_chunk_df = meta_df[meta_df[meta_df_id_col].isin(context.source_node_ids)]\n", 119 | " nb_output_format(question, context.response, similar_chunk_df)\n", 120 | "\n", 121 | "def get_documents_from_md_file_paths(fps):\n", 122 | " MarkdownReader = download_loader(\"MarkdownReader\")\n", 123 | " loader = MarkdownReader()\n", 124 | " documents = []\n", 125 | " for fp in fps:\n", 126 | " documents += loader.load_data(file=Path(fp))\n", 127 | " return documents" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "# 🛑 Set variables based on your machine's setup" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 4, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# these are part of this repo, so you don't need to change\n", 144 | "DATA_DIR='../data'\n", 145 | "LLAMA_INDEX_TUTORIAL_DATA = os.path.join(DATA_DIR, 'llama-index-tutorial')\n", 146 | "\n", 147 | "# this is unique to your machine. where did you clone https://github.com/Netflix/metaflow-docs to?\n", 148 | "YOUR_LOCAL_METAFLOW_DOCS_REPO_PATH = os.path.expanduser(\"~/Dev/metaflow-docs\")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "# Which GitHub repos?" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 5, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "REPO_PARAMS = [\n", 165 | " {\n", 166 | " \"deployment_url\": \"docs.metaflow.org\",\n", 167 | " \"repository_path\": \"https://github.com/Netflix/metaflow-docs\",\n", 168 | " \"repository_ref\": \"master\",\n", 169 | " \"base_search_path\": \"docs\",\n", 170 | " \"exclude_paths\": [\"docs/v\"],\n", 171 | " \"exclude_files\": [\"README.md\", \"README\"],\n", 172 | " }\n", 173 | "]" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "# 1. 
Llama Index Warmup" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 63, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "# This cell will trigger llama_index to look for OPENAI_API_KEY in environment variables,\n", 190 | "# then default to downloading a llama 2 model binary locally.\n", 191 | "\n", 192 | "# this thing is looking for a .txt file in the data dir.\n", 193 | "documents = SimpleDirectoryReader(LLAMA_INDEX_TUTORIAL_DATA).load_data()\n", 194 | "\n", 195 | "# Indexing is the first, and most crucial, stage in a RAG workflow.\n", 196 | "# It is the process of converting a set of documents into a vector representation.\n", 197 | "# This vector representation is later used to retrieve relevant documents for a given query.\n", 198 | "service_context = ServiceContext.from_defaults(\n", 199 | " llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.0)\n", 200 | ")\n", 201 | "index = VectorStoreIndex.from_documents(documents, service_context=service_context) " 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 64, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/markdown": [ 212 | "The author worked on writing and programming outside of school before college. They wrote short stories and tried writing programs on an IBM 1401 computer. They also built a microcomputer kit and started programming on it, writing simple games and a word processor." 213 | ], 214 | "text/plain": [ 215 | "" 216 | ] 217 | }, 218 | "metadata": {}, 219 | "output_type": "display_data" 220 | } 221 | ], 222 | "source": [ 223 | "# Query engine \"takes in a natural language query, and returns a response, along with reference context retrieved and passed to the LLM.\"\n", 224 | "query_engine = index.as_query_engine()\n", 225 | "response = query_engine.query(\"What did the author do growing up?\")\n", 226 | "dm(response.response)\n", 227 | "\n", 228 | "# Taking too many minutes to run with Llama 2 on my macbook :( \n", 229 | "# Conservatively estimate OpenAI API is ~$1 per dozen end-to-end runs of this notebook." 
230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "## Try a [Llama Hub tool for parsing `.md` files](https://llamahub.ai/l/file-markdown)\n", 237 | "\n", 238 | "This could be used in conjunction or in place of the custom markdown parser used in `./markdown_chunker.py`" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 40, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "from pathlib import Path\n", 248 | "from llama_index import download_loader\n", 249 | "import os" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 41, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "MarkdownReader = download_loader(\"MarkdownReader\")\n", 259 | "loader = MarkdownReader()\n", 260 | "\n", 261 | "# start with a single document\n", 262 | "test_path = os.path.abspath(\"%s/test-data/ob/blog/metaflow-fast-data.md\" % DATA_DIR)\n", 263 | "documents = loader.load_data(file=Path(test_path))" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 65, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "index = VectorStoreIndex.from_documents(documents, service_context=service_context)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "## Ask some questions over the index" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 66, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/markdown": [ 290 | "\n", 291 | "**Question:** What is the fastest way to load data onto AWS Batch instances using Metaflow?\n", 292 | "\n", 293 | "**Answer:** The fastest way to load data onto AWS Batch instances using Metaflow would be to utilize the data loading capabilities provided by Metaflow itself. Metaflow offers built-in functionality for handling data loading and processing, allowing you to efficiently transfer and process data on AWS Batch instances. By leveraging Metaflow's data loading features, you can optimize the loading process and ensure efficient utilization of AWS Batch resources.\n" 294 | ], 295 | "text/plain": [ 296 | "" 297 | ] 298 | }, 299 | "metadata": {}, 300 | "output_type": "display_data" 301 | } 302 | ], 303 | "source": [ 304 | "question = (\n", 305 | " \"What is the fastest way to load data onto AWS Batch instances using Metaflow?\"\n", 306 | ")\n", 307 | "\n", 308 | "query_engine = index.as_query_engine()\n", 309 | "response = query_engine.query(question).response\n", 310 | "\n", 311 | "dmqa(question, response)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 68, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/markdown": [ 322 | "\n", 323 | "**Question:** How does Metaflow use `tmpfs`?\n", 324 | "\n", 325 | "**Answer:** Metaflow uses `tmpfs` to store temporary data during the execution of workflows. `tmpfs` is a temporary file system that resides in memory, which means that the data stored in `tmpfs` is not persisted across system reboots. This makes it ideal for storing temporary data that is only needed during the execution of a workflow and can be discarded afterwards. 
By using `tmpfs`, Metaflow can achieve faster read and write operations compared to using disk-based storage.\n" 326 | ], 327 | "text/plain": [ 328 | "" 329 | ] 330 | }, 331 | "metadata": {}, 332 | "output_type": "display_data" 333 | } 334 | ], 335 | "source": [ 336 | "question = \"How does Metaflow use `tmpfs`?\"\n", 337 | "response = query_engine.query(question).response\n", 338 | "dmqa(question, response)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "# 2. Controlling hallucinations by curating an index\n", 346 | "Here are a few questions we will explore in this section:\n", 347 | "\n", 348 | "**What is the problem with the above workflow?**\n", 349 | "\n", 350 | "In the previous section's index, we created all the vectors from chunks of a [post](https://outerbounds.com/blog/metaflow-fast-data/) specifically about the `tmpfs` feature. [Metaflow docs](https://docs.metaflow.org/) don't contain that much content about `tmpfs` yet.\n", 351 | "\n", 352 | "If we use the Metaflow docs as the source objects to populate the index, and a question is asked to the model about `tmpfs`, how can we know if it is hallucinating it, or referencing an actual piece of content that Metaflow maintainers endorse?\n", 353 | "> Bing Chat AI:\n", 354 | "Give a bunch of links to the content that LLM response was conditioned on.\n", 355 | "\n", 356 | "We will build a simple system like this in the next section. First, let's see the power of understanding the domain of our index, and then move to using it as a way to reference source material in the generated response." 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "## Index Metaflow docs" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 69, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "file_paths = DocumentationExtractor().filter_files(\n", 373 | " YOUR_LOCAL_METAFLOW_DOCS_REPO_PATH,\n", 374 | " base_search_path = \"docs\",\n", 375 | " exclude_paths = [\"docs/v\"],\n", 376 | " exclude_files = [\"README.md\", \"README\"],\n", 377 | " considered_extensions = [\".md\"],\n", 378 | ")" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 70, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "file_paths = [abs_path for abs_path, _ in file_paths]" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 71, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "documents = get_documents_from_md_file_paths(file_paths)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 72, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "Showing sample of 1 out of 500 objects\n" 409 | ] 410 | }, 411 | { 412 | "data": { 413 | "text/plain": [ 414 | "[Document(id_='b185a5f6-76a2-4095-9523-adb59d3518c9', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='168707b49cebf5424ed48615defbba683a0d7932f8451d263d491040c37aa87f', text='\\n\\nWelcome to Metaflow\\n\\nMetaflow makes it easy to build and manage real-life data science and machine learning projects.\\n\\n\\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]" 415 | ] 416 | }, 417 | "execution_count": 72, 418 | "metadata": {}, 419 | 
"output_type": "execute_result" 420 | } 421 | ], 422 | "source": [ 423 | "N = 1\n", 424 | "print(\n", 425 | " \"Showing sample of {m} out of {n} {t} objects\".format(\n", 426 | " m=N, n=len(documents), t=type(documents[0])\n", 427 | " )\n", 428 | ")\n", 429 | "documents[:N]" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 73, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "name": "stdout", 439 | "output_type": "stream", 440 | "text": [ 441 | "CPU times: user 582 ms, sys: 187 ms, total: 770 ms\n", 442 | "Wall time: 14.1 s\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "%%time\n", 448 | "index = VectorStoreIndex.from_documents(documents)\n", 449 | "# TODO: Measure times as this thing scales with N documents and larger sized documents." 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "## Q&A iterations over the Metaflow docs index" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 74, 462 | "metadata": {}, 463 | "outputs": [ 464 | { 465 | "data": { 466 | "text/markdown": [ 467 | "\n", 468 | "**Question:** What is Metaflow?\n", 469 | "\n", 470 | "**Answer:** Metaflow is a Python library that simplifies the development, deployment, and operation of data-intensive applications, specifically those related to data science and machine learning. It was initially created at Netflix to enhance the efficiency of data scientists working on a range of projects, from traditional statistics to cutting-edge deep learning. Metaflow is an open-source tool released under the Apache License, Version 2.0.\n" 471 | ], 472 | "text/plain": [ 473 | "" 474 | ] 475 | }, 476 | "metadata": {}, 477 | "output_type": "display_data" 478 | } 479 | ], 480 | "source": [ 481 | "question = \"What is Metaflow?\"\n", 482 | "context = qa_iter(question, index)\n", 483 | "dmqa(question, context.response)" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 75, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "data": { 493 | "text/markdown": [ 494 | "\n", 495 | "**Question:** How do I specify conda dependencies in my flow?\n", 496 | "\n", 497 | "**Answer:** You can specify conda dependencies in your flow using the `@conda_base` and `@conda` decorators. The `@conda_base` decorator is used at the flow level to specify explicit library dependencies, python version, and whether to exclude all steps from executing within a conda environment. The `@conda` decorator is used at the step level to update the explicit library dependencies, python version, and conda environment exclusion as specified by the `@conda_base` decorator. By using these decorators, you can define the conda environment for each step in your flow. Additionally, you can add an explicit dependency on a specific module by using the `@conda` decorator in the corresponding step.\n" 498 | ], 499 | "text/plain": [ 500 | "" 501 | ] 502 | }, 503 | "metadata": {}, 504 | "output_type": "display_data" 505 | } 506 | ], 507 | "source": [ 508 | "question = \"How do I specify conda dependencies in my flow?\"\n", 509 | "context = qa_iter(question, index)\n", 510 | "dmqa(question, context.response)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "### Do Metaflow docs know about `tmpfs` though?" 
518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 76, 523 | "metadata": {}, 524 | "outputs": [ 525 | { 526 | "data": { 527 | "text/markdown": [ 528 | "\n", 529 | "**Question:** How does Metaflow use `tmpfs`?\n", 530 | "\n", 531 | "**Answer:** Metaflow does not use `tmpfs` based on the given information.\n" 532 | ], 533 | "text/plain": [ 534 | "" 535 | ] 536 | }, 537 | "metadata": {}, 538 | "output_type": "display_data" 539 | } 540 | ], 541 | "source": [ 542 | "question = \"How does Metaflow use `tmpfs`?\"\n", 543 | "dmqa(question, qa_iter(question, index).response)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "### Adding specific knowledge to the index\n", 551 | "\n", 552 | "As of August, 2023, the Metaflow documentation do not have much writing about `tmpfs`, so this makes sense.\n", 553 | "\n", 554 | "How can we add [Outerbounds blog post](https://outerbounds.com/blog/metaflow-fast-data/) that announced the `tmpfs` and Metaflow integration to the index, to give the model the context it needs to answer this question?\n", 555 | "\n", 556 | "Let's create an index that combines the one we saw earlier for the `tmpfs` blog post with the one we just created for Metaflow docs." 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 78, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "fast_data_file_path = os.path.abspath('%s/test-data/ob/blog/metaflow-fast-data.md' % DATA_DIR)\n", 566 | "\n", 567 | "# combining the document set\n", 568 | "fast_data_doc = get_documents_from_md_file_paths([fast_data_file_path])\n", 569 | "index_fast_data_post = VectorStoreIndex.from_documents(fast_data_doc)" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 79, 575 | "metadata": {}, 576 | "outputs": [ 577 | { 578 | "data": { 579 | "text/markdown": [ 580 | "\n", 581 | "**Question:** How does Metaflow use `tmpfs`?\n", 582 | "\n", 583 | "**Answer:** Metaflow recently implemented support for memory-based `tmpfs` filesystem on AWS Batch and Kubernetes. This feature allows users to create an ephemeral filesystem backed by memory on the fly, without making any changes to the infrastructure. By enabling this feature using the `@batch(use_tmpfs=True)` decorator for AWS Batch workloads or `@kubernetes(use_tmpfs=True)` decorator for Kubernetes, the `metaflow.S3` client is automatically aware of the `tmpfs` volume and will use it to speed up downloads. This helps improve the performance of data downloads from S3 in Metaflow workflows.\n" 584 | ], 585 | "text/plain": [ 586 | "" 587 | ] 588 | }, 589 | "metadata": {}, 590 | "output_type": "display_data" 591 | } 592 | ], 593 | "source": [ 594 | "# same question as above. now we can answer it with the new index.\n", 595 | "question = \"How does Metaflow use `tmpfs`?\"\n", 596 | "dmqa(question, qa_iter(question, index_fast_data_post).response)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 81, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "# indexes are updateable/composable! 
\n", 606 | "for doc_chunk in fast_data_doc:\n", 607 | " index.insert(doc_chunk)" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 82, 613 | "metadata": {}, 614 | "outputs": [ 615 | { 616 | "data": { 617 | "text/markdown": [ 618 | "\n", 619 | "**Question:** How does Metaflow use `tmpfs`?\n", 620 | "\n", 621 | "**Answer:** Metaflow uses `tmpfs` by implementing support for memory-based `tmpfs` filesystem on Batch and Kubernetes. This allows users to create an ephemeral filesystem backed by memory on the fly, without having to make any changes on the infrastructure side. When the `tmpfs` feature is enabled, the `metaflow.S3` client automatically uses it to speed up downloads. To enable this feature, users can add `@batch(use_tmpfs=True)` for AWS Batch workloads or `@kubernetes(use_tmpfs=True)` for Kubernetes in their Metaflow code.\n" 622 | ], 623 | "text/plain": [ 624 | "" 625 | ] 626 | }, 627 | "metadata": {}, 628 | "output_type": "display_data" 629 | }, 630 | { 631 | "data": { 632 | "text/markdown": [ 633 | "\n", 634 | "**Question:** How do I specify conda dependencies in my flow?\n", 635 | "\n", 636 | "**Answer:** You can specify conda dependencies in your flow using the `@conda_base` and `@conda` decorators. The `@conda_base` decorator is used at the flow level to specify explicit library dependencies, python version, and whether to exclude all steps from executing within a conda environment. The `@conda` decorator is used at the step level to update the explicit library dependencies, python version, and conda environment exclusion as specified by the `@conda_base` decorator. By using these decorators, you can define the conda environment for each step in your flow. Additionally, you can add an explicit dependency on a specific module by using the `@conda` decorator in the desired step.\n" 637 | ], 638 | "text/plain": [ 639 | "" 640 | ] 641 | }, 642 | "metadata": {}, 643 | "output_type": "display_data" 644 | } 645 | ], 646 | "source": [ 647 | "# same questions as above. now we can answer it with the new index.\n", 648 | "question = \"How does Metaflow use `tmpfs`?\"\n", 649 | "dmqa(question, qa_iter(question, index).response)\n", 650 | "\n", 651 | "# and this one too.\n", 652 | "question = \"How do I specify conda dependencies in my flow?\"\n", 653 | "dmqa(question, qa_iter(question, index).response)" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "# 3. 
Constructing an index based on sections of endorsed content we can link to\n", 661 | "[Parse the Documents into Nodes](https://gpt-index.readthedocs.io/en/latest/end_to_end_tutorials/usage_pattern.html#parse-the-documents-into-nodes)" 662 | ] 663 | }, 664 | { 665 | "cell_type": "markdown", 666 | "metadata": {}, 667 | "source": [ 668 | "## Fetch all file paths of .md files" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 83, 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "name": "stdout", 678 | "output_type": "stream", 679 | "text": [ 680 | "Looking for remote repository at https://github.com/Netflix/metaflow-docs\n", 681 | "Looking for remote repository at https://github.com/huggingface/accelerate\n" 682 | ] 683 | } 684 | ], 685 | "source": [ 686 | "from rag_tools.filetypes.markdown import Mixin as mm\n", 687 | "# this cell is like a condensed version of `/flows/markdown_chunker.py`\n", 688 | "_mm = mm()\n", 689 | "_mm.repo_params = REPO_PARAMS\n", 690 | "\n", 691 | "# this is an unprocessed df, so you may want to clean it as /flows/data_table_processor.py does.\n", 692 | "df = _mm.load_df_from_repo_list()" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 84, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "documents, ids = get_documents_from_content_section_df(df)\n", 702 | "df['doc_ids'] = ids" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 85, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "index = VectorStoreIndex.from_documents(documents)" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 100, 717 | "metadata": {}, 718 | "outputs": [ 719 | { 720 | "data": { 721 | "text/markdown": [ 722 | "The above instructions work even if you use [`@conda`\n", 723 | " decorators](/scaling/dependencies#managing-dependencies-with-conda-decorator) in your\n", 724 | " code; you need, however, to ensure that the `conda` binary is available in your `PATH`.\n", 725 | " The easiest way to do this is to set the `PATH` environment variable to properly include\n", 726 | " the path to the `conda` binary if it is in a non-standard location. In VSCode, you can\n", 727 | " simply add this value in the env section of launch.json and in PyCharm, the UI allows\n", 728 | " you to set environment variables." 729 | ], 730 | "text/plain": [ 731 | "" 732 | ] 733 | }, 734 | "metadata": {}, 735 | "output_type": "display_data" 736 | } 737 | ], 738 | "source": [ 739 | "import random\n", 740 | "dm(random.choice(documents).text)" 741 | ] 742 | }, 743 | { 744 | "cell_type": "markdown", 745 | "metadata": {}, 746 | "source": [ 747 | "# 4. 
Use Cases" 748 | ] 749 | }, 750 | { 751 | "cell_type": "markdown", 752 | "metadata": {}, 753 | "source": [ 754 | "## Load a dataframe of text chunks and metadata from your latest workflow runs" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": 6, 760 | "metadata": {}, 761 | "outputs": [ 762 | { 763 | "name": "stdout", 764 | "output_type": "stream", 765 | "text": [ 766 | "1692847583518101\n" 767 | ] 768 | } 769 | ], 770 | "source": [ 771 | "# find latest Metaflow run that saved processed df\n", 772 | "run = None\n", 773 | "for _run in Flow('DataTableProcessor'):\n", 774 | " if _run.data.save_processed_df:\n", 775 | " run = _run\n", 776 | " break\n", 777 | "\n", 778 | "print(run.id)" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 7, 784 | "metadata": {}, 785 | "outputs": [], 786 | "source": [ 787 | "df = run.data.processed_df\n", 788 | "documents, ids = get_documents_from_content_section_df(df)\n", 789 | "df['doc_id'] = ids\n", 790 | "index = VectorStoreIndex(documents)" 791 | ] 792 | }, 793 | { 794 | "cell_type": "markdown", 795 | "metadata": {}, 796 | "source": [ 797 | "## Q&A" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": 12, 803 | "metadata": {}, 804 | "outputs": [ 805 | { 806 | "name": "stdout", 807 | "output_type": "stream", 808 | "text": [ 809 | "['1', '1209']\n" 810 | ] 811 | }, 812 | { 813 | "data": { 814 | "text/markdown": [ 815 | "#### What is Metaflow?" 816 | ], 817 | "text/plain": [ 818 | "" 819 | ] 820 | }, 821 | "metadata": {}, 822 | "output_type": "display_data" 823 | }, 824 | { 825 | "data": { 826 | "text/markdown": [ 827 | "**Retrieved Response**" 828 | ], 829 | "text/plain": [ 830 | "" 831 | ] 832 | }, 833 | "metadata": {}, 834 | "output_type": "display_data" 835 | }, 836 | { 837 | "data": { 838 | "text/markdown": [ 839 | "Metaflow is a Python library that simplifies the development, deployment, and operation of data-intensive applications, particularly those related to data science and machine learning. It was initially created at Netflix to enhance the productivity of data scientists working on various projects. Metaflow is available as an open-source framework under the Apache License, Version 2.0. It allows data scientists to focus on important aspects like feature engineering and model development while abstracting away tasks such as job organization, orchestration, scheduling, and interaction with data warehouses. Additionally, Metaflow enables the building of production-ready machine learning workflows using a simple Python API and facilitates seamless transitions between local prototyping environments and cloud-based deployments." 840 | ], 841 | "text/plain": [ 842 | "" 843 | ] 844 | }, 845 | "metadata": {}, 846 | "output_type": "display_data" 847 | }, 848 | { 849 | "data": { 850 | "text/markdown": [ 851 | "#### Sources" 852 | ], 853 | "text/plain": [ 854 | "" 855 | ] 856 | }, 857 | "metadata": {}, 858 | "output_type": "display_data" 859 | }, 860 | { 861 | "data": { 862 | "text/markdown": [ 863 | "##### [What is Metaflow](https://docs.metaflow.org/introduction/what-is-metaflow#what-is-metaflow)" 864 | ], 865 | "text/plain": [ 866 | "" 867 | ] 868 | }, 869 | "metadata": {}, 870 | "output_type": "display_data" 871 | }, 872 | { 873 | "data": { 874 | "text/markdown": [ 875 | "Metaflow is a human-friendly Python library that makes it straightforward to develop, deploy, and op..." 
876 | ], 877 | "text/plain": [ 878 | "" 879 | ] 880 | }, 881 | "metadata": {}, 882 | "output_type": "display_data" 883 | }, 884 | { 885 | "data": { 886 | "text/markdown": [ 887 | "##### [Reproducible ML pipelines with Metaflow](https://outerbounds.com/blog/machine-learning-pipelines-from-prototype-to-production#reproducible-ml-pipelines-with-metaflow)" 888 | ], 889 | "text/plain": [ 890 | "" 891 | ] 892 | }, 893 | "metadata": {}, 894 | "output_type": "display_data" 895 | }, 896 | { 897 | "data": { 898 | "text/markdown": [ 899 | "To help data scientists focus on the parts of the stack they really care about, such as feature engi..." 900 | ], 901 | "text/plain": [ 902 | "" 903 | ] 904 | }, 905 | "metadata": {}, 906 | "output_type": "display_data" 907 | } 908 | ], 909 | "source": [ 910 | "question = \"What is Metaflow?\"\n", 911 | "generative_search_engine_iter(question, index, df)" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": 108, 917 | "metadata": {}, 918 | "outputs": [ 919 | { 920 | "data": { 921 | "text/markdown": [ 922 | "#### How does Metaflow work with Kubernetes?" 923 | ], 924 | "text/plain": [ 925 | "" 926 | ] 927 | }, 928 | "metadata": {}, 929 | "output_type": "display_data" 930 | }, 931 | { 932 | "data": { 933 | "text/markdown": [ 934 | "**Retrieved Response**" 935 | ], 936 | "text/plain": [ 937 | "" 938 | ] 939 | }, 940 | "metadata": {}, 941 | "output_type": "display_data" 942 | }, 943 | { 944 | "data": { 945 | "text/markdown": [ 946 | "Metaflow now has the capability to run on top of any Kubernetes cluster. Users can run all or parts of any Metaflow flow on Kubernetes from their workstation by using the command \"run --with kubernetes\". Additionally, users can deploy their flow to Argo Workflows, a Kubernetes-native workflow scheduler, with a single command \"argo-workflows create\" to execute the flow asynchronously. For more information on setting up and operating Kubernetes for Metaflow, users can refer to the engineering resources provided by Metaflow." 947 | ], 948 | "text/plain": [ 949 | "" 950 | ] 951 | }, 952 | "metadata": {}, 953 | "output_type": "display_data" 954 | }, 955 | { 956 | "data": { 957 | "text/markdown": [ 958 | "#### Sources" 959 | ], 960 | "text/plain": [ 961 | "" 962 | ] 963 | }, 964 | "metadata": {}, 965 | "output_type": "display_data" 966 | }, 967 | { 968 | "data": { 969 | "text/markdown": [ 970 | "##### [Add capability to launch Metaflow tasks on Kubernetes and schedule Metaflow flows with Argo Workflows.](https://docs.metaflow.org/internals/release-notes#add-capability-to-launch-metaflow-tasks-on-kubernetes-and-schedule-metaflow-flows-with-argo-workflows)" 971 | ], 972 | "text/plain": [ 973 | "" 974 | ] 975 | }, 976 | "metadata": {}, 977 | "output_type": "display_data" 978 | }, 979 | { 980 | "data": { 981 | "text/markdown": [ 982 | "This release enables brand new capabilities for [Metaflow on top of\n", 983 | " Kubernetes](https://outerbounds..." 984 | ], 985 | "text/plain": [ 986 | "" 987 | ] 988 | }, 989 | "metadata": {}, 990 | "output_type": "display_data" 991 | }, 992 | { 993 | "data": { 994 | "text/markdown": [ 995 | "##### [Using Kubernetes](https://docs.metaflow.org/scaling/remote-tasks/kubernetes#using-kubernetes)" 996 | ], 997 | "text/plain": [ 998 | "" 999 | ] 1000 | }, 1001 | "metadata": {}, 1002 | "output_type": "display_data" 1003 | }, 1004 | { 1005 | "data": { 1006 | "text/markdown": [ 1007 | "Here are some useful tips and tricks related to running Metaflow on Kubernetes. 
See our\n", 1008 | " engineering..." 1009 | ], 1010 | "text/plain": [ 1011 | "" 1012 | ] 1013 | }, 1014 | "metadata": {}, 1015 | "output_type": "display_data" 1016 | } 1017 | ], 1018 | "source": [ 1019 | "question = \"How does Metaflow work with Kubernetes?\"\n", 1020 | "generative_search_engine_iter(question, index, df)" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "code", 1025 | "execution_count": 109, 1026 | "metadata": {}, 1027 | "outputs": [ 1028 | { 1029 | "data": { 1030 | "text/markdown": [ 1031 | "#### What is a DAG?" 1032 | ], 1033 | "text/plain": [ 1034 | "" 1035 | ] 1036 | }, 1037 | "metadata": {}, 1038 | "output_type": "display_data" 1039 | }, 1040 | { 1041 | "data": { 1042 | "text/markdown": [ 1043 | "**Retrieved Response**" 1044 | ], 1045 | "text/plain": [ 1046 | "" 1047 | ] 1048 | }, 1049 | "metadata": {}, 1050 | "output_type": "display_data" 1051 | }, 1052 | { 1053 | "data": { 1054 | "text/markdown": [ 1055 | "A DAG, or directed acyclic graph, is a graph that consists of nodes connected by directed edges, where the edges have a specific direction and there are no cycles in the graph. In the context of Metaflow, a DAG is inferred based on the transitions between step functions. The nodes in the DAG are the steps, which represent operations, and the edges represent the transitions between steps. The DAG structure is important for defining the flow of execution and dependencies between steps in Metaflow." 1056 | ], 1057 | "text/plain": [ 1058 | "" 1059 | ] 1060 | }, 1061 | "metadata": {}, 1062 | "output_type": "display_data" 1063 | }, 1064 | { 1065 | "data": { 1066 | "text/markdown": [ 1067 | "#### Sources" 1068 | ], 1069 | "text/plain": [ 1070 | "" 1071 | ] 1072 | }, 1073 | "metadata": {}, 1074 | "output_type": "display_data" 1075 | }, 1076 | { 1077 | "data": { 1078 | "text/markdown": [ 1079 | "##### [Graph](https://docs.metaflow.org/internals/technical-overview#graph)" 1080 | ], 1081 | "text/plain": [ 1082 | "" 1083 | ] 1084 | }, 1085 | "metadata": {}, 1086 | "output_type": "display_data" 1087 | }, 1088 | { 1089 | "data": { 1090 | "text/markdown": [ 1091 | "Metaflow infers a directed (typically acyclic) graph based on the transitions between\n", 1092 | " step function..." 1093 | ], 1094 | "text/plain": [ 1095 | "" 1096 | ] 1097 | }, 1098 | "metadata": {}, 1099 | "output_type": "display_data" 1100 | }, 1101 | { 1102 | "data": { 1103 | "text/markdown": [ 1104 | "##### [The Structure of Metaflow Code](https://docs.metaflow.org/metaflow/basics#the-structure-of-metaflow-code)" 1105 | ], 1106 | "text/plain": [ 1107 | "" 1108 | ] 1109 | }, 1110 | "metadata": {}, 1111 | "output_type": "display_data" 1112 | }, 1113 | { 1114 | "data": { 1115 | "text/markdown": [ 1116 | "Metaflow follows [the dataflow\n", 1117 | " paradigm](https://en.wikipedia.org/wiki/Dataflow_programming) which ..." 1118 | ], 1119 | "text/plain": [ 1120 | "" 1121 | ] 1122 | }, 1123 | "metadata": {}, 1124 | "output_type": "display_data" 1125 | } 1126 | ], 1127 | "source": [ 1128 | "question = \"What is a DAG?\"\n", 1129 | "generative_search_engine_iter(question, index, df)" 1130 | ] 1131 | }, 1132 | { 1133 | "cell_type": "markdown", 1134 | "metadata": {}, 1135 | "source": [ 1136 | "## RAG for a generic sales pitch " 1137 | ] 1138 | }, 1139 | { 1140 | "cell_type": "markdown", 1141 | "metadata": {}, 1142 | "source": [ 1143 | "### A custom prompt template in pure Python\n", 1144 | "Many tools exist to make prompts easy to manage. [Langchain](https://www.langchain.com/) is an emerging leader in this space." 
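As an aside, the same idea can be expressed with a prompt-management library. A minimal, hypothetical sketch using LangChain's `PromptTemplate` is shown below for comparison; the rest of this notebook sticks to a plain Python format string, which works just as well for a single template.

```python
# Illustrative sketch only -- the notebook itself uses a plain Python template below.
from langchain.prompts import PromptTemplate

email_prompt = PromptTemplate(
    input_variables=["role", "company", "interests"],
    template=(
        "Write an introduction email to a technical user who works as a {role} at {company}. "
        "Highlight Metaflow features related to their interests, including {interests}."
    ),
)

# .format() fills the placeholders, analogous to str.format on the template defined below.
print(email_prompt.format(role="data scientist", company="Big Industries Co.", interests="mlops"))
```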
1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | "execution_count": 126, 1150 | "metadata": {}, 1151 | "outputs": [], 1152 | "source": [ 1153 | "prompt_template = \"\"\"\n", 1154 | "Write an introduction email to a potential technical user who works as a {role} at {company}.\n", 1155 | "\n", 1156 | "Do not directly address the user's role or company anywhere in the email.\n", 1157 | "\n", 1158 | "Write the email for a technical audience who doesn't want to read marketing copy.\n", 1159 | "\n", 1160 | "Highlight Metaflow features related to their interests including {interests}.\n", 1161 | "\n", 1162 | "Include a summary motivating the benefits of these features by summarizing this context about Metaflow:\n", 1163 | "{context_about_interests}\n", 1164 | "\n", 1165 | "Make a subtle reference that Outerbounds platform can help them with {enterprise_platform_interest_hook}.\n", 1166 | "\n", 1167 | "Include a summary motivating the benefits of Outerbounds platform by summarizing this context about Outerbounds platform:\n", 1168 | "{context_about_enterprise_platform_interest_hook}\n", 1169 | "\n", 1170 | "Make the CTA to schedule a meeting to discuss how Outerbounds platform can help them.\n", 1171 | "\n", 1172 | "Make the email as short as possible. \n", 1173 | "\n", 1174 | "Do not reference your own profession or any experiences. Do not talk about yourself.\n", 1175 | "\n", 1176 | "Do not explicitly reference the company that the receiver works for. Only implicitly use this knowledge to demonstrate knowledge of the problems their organization may face.\n", 1177 | "\n", 1178 | "Avoid speaking from the first person.\n", 1179 | "\n", 1180 | "Avoid directly saying that you know about anyone's past experience or background. \n", 1181 | "\n", 1182 | "Avoid saying anything with similar sentiment to these statements:\n", 1183 | "
 - Author Metaflow flows using notebooks\n", 1184 | "
 - Refer to an interest listed above as something Metaflow works with\n", 1185 | "\"\"\"" 1186 | ] 1187 | }, 1188 | { 1189 | "cell_type": "markdown", 1190 | "metadata": {}, 1191 | "source": [ 1192 | "### Prompt engineering 101\n", 1193 | "Inject some relevant context into our prompt template." 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "code", 1198 | "execution_count": 127, 1199 | "metadata": {}, 1200 | "outputs": [], 1201 | "source": [ 1202 | "things_we_learned_about_a_prospect = dict(\n", 1203 | " role=\"data scientist\",\n", 1204 | " company=\"Big Industries Co.\",\n", 1205 | "\n", 1206 | " # comma-separated lists\n", 1207 | " interests=\"mlops, deep learning, kubernetes\",\n", 1208 | " enterprise_platform_interest_hook=\"CI/CD, security\",\n", 1209 | ")" 1210 | ] 1211 | }, 1212 | { 1213 | "cell_type": "markdown", 1214 | "metadata": {}, 1215 | "source": [ 1216 | "### RAGify! \n", 1217 | "Here we use the Q&A iteration you saw in previous sections, but instead of printing the results we are using them to \"augment\" the prompt." 1218 | ] 1219 | }, 1220 | { 1221 | "cell_type": "code", 1222 | "execution_count": 128, 1223 | "metadata": {}, 1224 | "outputs": [ 1225 | { 1226 | "name": "stderr", 1227 | "output_type": "stream", 1228 | "text": [ 1229 | "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:27<00:00, 9.04s/it]\n", 1230 | "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.34s/it]\n" 1231 | ] 1232 | } 1233 | ], 1234 | "source": [ 1235 | "# RAG step - fetch some context\n", 1236 | "from tqdm import tqdm\n", 1237 | "\n", 1238 | "interest_list = things_we_learned_about_a_prospect[\"interests\"].split(\", \")\n", 1239 | "context_about_interests = []\n", 1240 | "for interest in tqdm(interest_list):\n", 1241 | " context_about_interest = qa_iter(\n", 1242 | " f\"Describe how Metaflow and {interest} can be used together in ML workflows. Focus on Metaflow being used as a complimentary tool.\",\n", 1243 | " index,\n", 1244 | " )\n", 1245 | " context_about_interests.append(context_about_interest)\n", 1246 | "\n", 1247 | "enterprise_platform_interest_hook = things_we_learned_about_a_prospect[\n", 1248 | " \"enterprise_platform_interest_hook\"\n", 1249 | "].split(\", \")\n", 1250 | "context_about_enterprise_platform_interest_hook = []\n", 1251 | "for interest in tqdm(enterprise_platform_interest_hook):\n", 1252 | " context_about_interest = qa_iter(\n", 1253 | " f\"Describe how Metaflow and {interest} can be used together in ML workflows. 
Focus on Metaflow being used as a complimentary tool.\",\n", 1254 | " index,\n", 1255 | " )\n", 1256 | " context_about_enterprise_platform_interest_hook.append(context_about_interest)" 1257 | ] 1258 | }, 1259 | { 1260 | "cell_type": "code", 1261 | "execution_count": 129, 1262 | "metadata": {}, 1263 | "outputs": [], 1264 | "source": [ 1265 | "# unpack context injection data to encourage the generative LLM to embed the links in its Markdown response.\n", 1266 | "\n", 1267 | "def prepare_with_links(context_list: List[Context]):\n", 1268 | " def strip_numbers_and_punctuation(header):\n", 1269 | " import string\n", 1270 | " import re\n", 1271 | "\n", 1272 | " return re.sub(\n", 1273 | " r\"\\d\", \"\", header.translate(str.maketrans(\"\", \"\", string.punctuation))\n", 1274 | " )\n", 1275 | "\n", 1276 | " context_to_inject = \"\"\"\n", 1277 | "The results will be given to you in a format like\n", 1278 | "
 - \"Text\": [Comma-separated list of [Link Label](Link URL)]\n", 1279 | "Use the [Link Label](Link URL) syntax in the summary, and use the links in context of the paragraph.\n", 1280 | "\n", 1281 | "These are the results to summarize:\n", 1282 | "\"\"\"\n", 1283 | " for _context in context_list:\n", 1284 | " similar_chunk_df = _context.get_link_df(df)\n", 1285 | " links = []\n", 1286 | " for header, url in list(\n", 1287 | " zip(similar_chunk_df.header.values, similar_chunk_df.page_url.values)\n", 1288 | " ):\n", 1289 | " links.append(f\"[{strip_numbers_and_punctuation(header)}]({url})\")\n", 1290 | " context_to_inject += (\n", 1291 | " \"
\" + \"  -\" + ' \"' + _context.response + '\": ' + \", \".join(links)\n", 1292 | " )\n", 1293 | " return context_to_inject" 1294 | ] 1295 | }, 1296 | { 1297 | "cell_type": "code", 1298 | "execution_count": 130, 1299 | "metadata": {}, 1300 | "outputs": [], 1301 | "source": [ 1302 | "prompt = prompt_template.format(\n", 1303 | " **things_we_learned_about_a_prospect,\n", 1304 | " context_about_interests=prepare_with_links(context_about_interests),\n", 1305 | " context_about_enterprise_platform_interest_hook=prepare_with_links(\n", 1306 | " context_about_enterprise_platform_interest_hook\n", 1307 | " )\n", 1308 | ")" 1309 | ] 1310 | }, 1311 | { 1312 | "cell_type": "code", 1313 | "execution_count": 131, 1314 | "metadata": {}, 1315 | "outputs": [ 1316 | { 1317 | "name": "stdout", 1318 | "output_type": "stream", 1319 | "text": [ 1320 | "Enter a negative sentiment to avoid by append to the prompt with an 'avoid' instruction, or press enter to continue, or type r then enter to restart: \n" 1321 | ] 1322 | } 1323 | ], 1324 | "source": [ 1325 | "# append negative sentiment escapes to prompt.\n", 1326 | "# treat these like UX escape hatches, where you can always just stuff extra things in, until you reach the model's context width.\n", 1327 | "user_interaction = True\n", 1328 | "negative_sentiment_context_lines = []\n", 1329 | "while user_interaction:\n", 1330 | "\n", 1331 | " if len(negative_sentiment_context_lines) == 0:\n", 1332 | " user_input = input(\n", 1333 | " \"Enter a negative sentiment to avoid by append to the prompt with an 'avoid' instruction, or press enter to continue, or type r then enter to restart: \"\n", 1334 | " )\n", 1335 | " else:\n", 1336 | " print(\"Current list of negative sentiment statements:\")\n", 1337 | " for line in negative_sentiment_context_lines:\n", 1338 | " print(line)\n", 1339 | " user_input = input(\n", 1340 | " \"Single enter to continue, double enter to complete list of results of negative sentiment statements.\"\n", 1341 | " )\n", 1342 | " # process user input\n", 1343 | " if user_input == \"\":\n", 1344 | " user_interaction = False\n", 1345 | " elif user_input == \"r\":\n", 1346 | " negative_sentiment_context_lines = []\n", 1347 | " else:\n", 1348 | " negative_sentiment_context_lines.append(user_input)\n", 1349 | "\n", 1350 | "for line in negative_sentiment_context_lines:\n", 1351 | " prompt += \"
\" + \"  -\" + \" \" + line" 1352 | ] 1353 | }, 1354 | { 1355 | "cell_type": "markdown", 1356 | "metadata": {}, 1357 | "source": [ 1358 | "### View the final prompt the model will see" 1359 | ] 1360 | }, 1361 | { 1362 | "cell_type": "code", 1363 | "execution_count": 132, 1364 | "metadata": {}, 1365 | "outputs": [ 1366 | { 1367 | "data": { 1368 | "text/markdown": [ 1369 | "##### The first five hundred chars" 1370 | ], 1371 | "text/plain": [ 1372 | "" 1373 | ] 1374 | }, 1375 | "metadata": {}, 1376 | "output_type": "display_data" 1377 | }, 1378 | { 1379 | "data": { 1380 | "text/markdown": [ 1381 | "\n", 1382 | "Write an introduction email to a potential technical user who works as a data scientist at Big Industries Co..\n", 1383 | "\n", 1384 | "Do not directly address the user's role or company anywhere in the email.\n", 1385 | "\n", 1386 | "Write the email for a technical audience who doesn't want to read marketing copy.\n", 1387 | "\n", 1388 | "Highlight Metaflow features related to their interests including mlops, deep learning, kubernetes.\n", 1389 | "\n", 1390 | "Include a summary motivating the benefits of these features by summarizing this context about Metaflow:\n", 1391 | "\n", 1392 | "The results will be give..." 1393 | ], 1394 | "text/plain": [ 1395 | "" 1396 | ] 1397 | }, 1398 | "metadata": {}, 1399 | "output_type": "display_data" 1400 | } 1401 | ], 1402 | "source": [ 1403 | "dm(\"##### The first five hundred chars\") \n", 1404 | "dm(prompt[:500] + '...')" 1405 | ] 1406 | }, 1407 | { 1408 | "cell_type": "markdown", 1409 | "metadata": {}, 1410 | "source": [ 1411 | "### See your RAG app in action! " 1412 | ] 1413 | }, 1414 | { 1415 | "cell_type": "code", 1416 | "execution_count": 133, 1417 | "metadata": {}, 1418 | "outputs": [ 1419 | { 1420 | "data": { 1421 | "text/markdown": [ 1422 | "##### Generated email" 1423 | ], 1424 | "text/plain": [ 1425 | "" 1426 | ] 1427 | }, 1428 | "metadata": {}, 1429 | "output_type": "display_data" 1430 | }, 1431 | { 1432 | "data": { 1433 | "text/markdown": [ 1434 | "Subject: Enhancing ML Workflows with Metaflow and Outerbounds Platform\n", 1435 | "\n", 1436 | "Dear [Recipient],\n", 1437 | "\n", 1438 | "I hope this email finds you well. I wanted to reach out to introduce you to Metaflow, a powerful tool that can enhance your data science workflows and help you achieve scalability, reproducibility, and production-readiness.\n", 1439 | "\n", 1440 | "Metaflow can be used as a complimentary tool in ML workflows, particularly when combined with MLOps practices. By integrating Metaflow into your pipeline, you can leverage its capabilities to build and deploy ML models more efficiently. It takes care of low-level infrastructure such as data, compute, orchestration, and versioning, allowing you to focus on the fun parts of building applications and models. [Metaflow takes care of the plumbing so you can focus on the fun parts](https://docs.metaflow.org/introduction/why-metaflow#10-metaflow-takes-care-of-the-plumbing-so-you-can-focus-on-the-fun-parts).\n", 1441 | "\n", 1442 | "For deep learning projects, Metaflow provides a robust and user-friendly foundation. It covers the full stack of DS/ML infrastructure, allowing you to focus on iterating on ideas quickly and deploying them confidently. 
[Metaflow covers the full stack of DS/ML infrastructure](https://docs.metaflow.org/introduction/why-metaflow#9-metaflow-covers-the-full-stack-of-ds-ml-infrastructure).\n", 1443 | "\n", 1444 | "If you're working with Kubernetes, Metaflow seamlessly integrates with it to leverage scalable infrastructure for running ML/DS applications. This makes it suitable for both small and large organizations. [Metaflow relies on systems that engineers know and trust](https://docs.metaflow.org/introduction/why-metaflow#11-metaflow-relies-on-systems-that-engineers-know-and-trust).\n", 1445 | "\n", 1446 | "In addition to Metaflow, I wanted to mention Outerbounds Platform, which can further enhance your ML workflows. It offers CI/CD capabilities, ensuring that changes to ML models and data pipelines are thoroughly tested and deployed in a reliable and efficient manner. Outerbounds Platform also prioritizes security, respecting your company's security policies and providing a secure environment for executing data science projects.\n", 1447 | "\n", 1448 | "I would love to schedule a meeting to discuss how Metaflow and Outerbounds Platform can specifically benefit your organization and address any challenges you may be facing. Please let me know a time that works for you, and I will be happy to set up a call.\n", 1449 | "\n", 1450 | "Looking forward to hearing from you.\n", 1451 | "\n", 1452 | "Best regards,\n", 1453 | "[Your Name]" 1454 | ], 1455 | "text/plain": [ 1456 | "" 1457 | ] 1458 | }, 1459 | "metadata": {}, 1460 | "output_type": "display_data" 1461 | } 1462 | ], 1463 | "source": [ 1464 | "query_engine = index.as_query_engine()\n", 1465 | "query_res = query_engine.query(prompt)\n", 1466 | "response = query_res.response\n", 1467 | "dm(\"##### Generated email\")\n", 1468 | "dm(response)" 1469 | ] 1470 | }, 1471 | { 1472 | "cell_type": "markdown", 1473 | "metadata": {}, 1474 | "source": [ 1475 | "## Chatbot" 1476 | ] 1477 | }, 1478 | { 1479 | "cell_type": "code", 1480 | "execution_count": 33, 1481 | "metadata": {}, 1482 | "outputs": [], 1483 | "source": [ 1484 | "repos = [\n", 1485 | " {\n", 1486 | " \"deployment_url\": \"docs.metaflow.org\",\n", 1487 | " \"repository_path\": os.path.expanduser(\"~/Dev/metaflow-docs\"),\n", 1488 | " \"repository_ref\": \"master\",\n", 1489 | " \"base_search_path\": \"docs\",\n", 1490 | " \"exclude_paths\": [\"docs/v\"],\n", 1491 | " \"exclude_files\": [\"README.md\", \"README\"],\n", 1492 | " }\n", 1493 | "]" 1494 | ] 1495 | }, 1496 | { 1497 | "cell_type": "code", 1498 | "execution_count": 34, 1499 | "metadata": {}, 1500 | "outputs": [], 1501 | "source": [ 1502 | "def local_repos_to_docs(repos):\n", 1503 | " documents = []\n", 1504 | " for params in repos:\n", 1505 | " md_files = DocumentationExtractor().filter_files(\n", 1506 | " params[\"repository_path\"],\n", 1507 | " base_search_path=params[\"base_search_path\"],\n", 1508 | " exclude_paths=params[\"exclude_paths\"],\n", 1509 | " exclude_files=params[\"exclude_files\"],\n", 1510 | " considered_extensions=[\".md\"]\n", 1511 | " )\n", 1512 | " md_files = [abs_path for abs_path, _ in md_files]\n", 1513 | " documents += get_documents_from_md_file_paths(md_files)\n", 1514 | " return documents" 1515 | ] 1516 | }, 1517 | { 1518 | "cell_type": "code", 1519 | "execution_count": 35, 1520 | "metadata": {}, 1521 | "outputs": [ 1522 | { 1523 | "data": { 1524 | "text/plain": [ 1525 | "[Document(id_='818c7d59-7d59-4c9a-8e26-be8fa66929f6', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], 
relationships={}, hash='168707b49cebf5424ed48615defbba683a0d7932f8451d263d491040c37aa87f', text='\\n\\nWelcome to Metaflow\\n\\nMetaflow makes it easy to build and manage real-life data science and machine learning projects.\\n\\n\\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n", 1526 | " Document(id_='baebefe9-9cf4-4c6b-8831-3fec827af036', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='0c733db8a7e3a129b3c47977e53aa8d0ee5e0942e768f40d822a65e1da4f9aea', text='\\n\\nMotivation\\n\\n- Why Metaflow\\n- What is Metaflow\\n- Metaflow Resources\\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]" 1527 | ] 1528 | }, 1529 | "execution_count": 35, 1530 | "metadata": {}, 1531 | "output_type": "execute_result" 1532 | } 1533 | ], 1534 | "source": [ 1535 | "documents = local_repos_to_docs(repos)\n", 1536 | "documents[:2]" 1537 | ] 1538 | }, 1539 | { 1540 | "cell_type": "code", 1541 | "execution_count": 36, 1542 | "metadata": {}, 1543 | "outputs": [], 1544 | "source": [ 1545 | "temp = 0.0\n", 1546 | "model = \"gpt-3.5-turbo\"\n", 1547 | "chat_mode = \"react\"" 1548 | ] 1549 | }, 1550 | { 1551 | "cell_type": "code", 1552 | "execution_count": 37, 1553 | "metadata": {}, 1554 | "outputs": [], 1555 | "source": [ 1556 | "service_context = ServiceContext.from_defaults(llm=OpenAI(model=model, temperature=temp))\n", 1557 | "index = VectorStoreIndex.from_documents(documents, service_context=service_context)" 1558 | ] 1559 | }, 1560 | { 1561 | "cell_type": "code", 1562 | "execution_count": 38, 1563 | "metadata": {}, 1564 | "outputs": [], 1565 | "source": [ 1566 | "chat_engine = index.as_chat_engine(service_context=service_context, chat_mode=chat_mode, verbose=True)" 1567 | ] 1568 | }, 1569 | { 1570 | "cell_type": "code", 1571 | "execution_count": 48, 1572 | "metadata": {}, 1573 | "outputs": [ 1574 | { 1575 | "name": "stdout", 1576 | "output_type": "stream", 1577 | "text": [ 1578 | "\u001b[38;5;200m\u001b[1;3mThought: I need to use a tool to help me answer the question.\n", 1579 | "Action: query_engine_tool\n", 1580 | "Action Input: {'input': 'How does Metaflow help AI developers?'}\n", 1581 | "\u001b[0m\u001b[36;1m\u001b[1;3mObservation: Metaflow helps AI developers by providing a robust and user-friendly foundation for data-intensive applications. It takes care of the low-level infrastructure such as data, compute, orchestration, and versioning, allowing developers to focus on building their own applications, models, and policies on top of it. This means that AI developers who have a basic understanding of Python can leverage Metaflow to streamline their development process and focus on the more enjoyable aspects of their work.\n", 1582 | "\u001b[0m" 1583 | ] 1584 | } 1585 | ], 1586 | "source": [ 1587 | "response = chat_engine.chat(\"How does Metaflow help AI developers?\")" 1588 | ] 1589 | }, 1590 | { 1591 | "cell_type": "code", 1592 | "execution_count": 34, 1593 | "metadata": {}, 1594 | "outputs": [ 1595 | { 1596 | "data": { 1597 | "text/markdown": [ 1598 | "Metaflow helps AI developers by providing a robust and user-friendly foundation for data-intensive applications. 
It takes care of the low-level infrastructure such as data, compute, orchestration, and versioning, allowing developers to focus on building their own applications, models, and policies on top of it. This means that AI developers who have a basic understanding of Python can leverage Metaflow to streamline their development process and focus on the more enjoyable aspects of their work." 1599 | ], 1600 | "text/plain": [ 1601 | "" 1602 | ] 1603 | }, 1604 | "metadata": {}, 1605 | "output_type": "display_data" 1606 | } 1607 | ], 1608 | "source": [ 1609 | "dm(response.response)" 1610 | ] 1611 | }, 1612 | { 1613 | "cell_type": "code", 1614 | "execution_count": 36, 1615 | "metadata": {}, 1616 | "outputs": [ 1617 | { 1618 | "name": "stdout", 1619 | "output_type": "stream", 1620 | "text": [ 1621 | "\u001b[38;5;200m\u001b[1;3mThought: I have already answered this question. I can provide a summary of my previous response to help answer the question again.\n", 1622 | "Action: query_engine_tool\n", 1623 | "Action Input: {'input': \"The enjoyable aspects of AI developers' work\"}\n", 1624 | "\u001b[0m\u001b[36;1m\u001b[1;3mObservation: The enjoyable aspects of AI developers' work include the ability to create value with machine learning, similar to traditional software engineering. The development and deployment of ML systems are expected to mature and improve over time, just as software development has done in the past 20 years. Efforts from various entities, such as governments, open source communities, and for-profit companies, are being made to articulate risks, develop best practices, and provide tooling to support ML development. This indicates a positive prognosis for the future of AI development.\n", 1625 | "\u001b[0m\u001b[38;5;200m\u001b[1;3mResponse: The enjoyable aspects of AI developers' work include the ability to create value with machine learning, the opportunity for continuous learning and staying updated with the latest advancements, the potential for impactful applications, collaboration with interdisciplinary teams, the freedom to experiment and innovate, and the automation of repetitive tasks. Additionally, efforts from various entities are being made to support and improve the development and deployment of ML systems, indicating a positive prognosis for the future of AI development.\n", 1626 | "\u001b[0m" 1627 | ] 1628 | }, 1629 | { 1630 | "data": { 1631 | "text/markdown": [ 1632 | "The enjoyable aspects of AI developers' work include the ability to create value with machine learning, the opportunity for continuous learning and staying updated with the latest advancements, the potential for impactful applications, collaboration with interdisciplinary teams, the freedom to experiment and innovate, and the automation of repetitive tasks. Additionally, efforts from various entities are being made to support and improve the development and deployment of ML systems, indicating a positive prognosis for the future of AI development." 
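The chat engine keeps conversational state between calls, which is why the follow-up question above could refer back to "their work" without restating the topic. A minimal sketch of an interactive loop around it (similar in spirit to the commented-out `chat_engine.chat_repl()` call in the next cell, and assuming the `chat_engine` created above) could look like this:

```python
# Illustrative sketch: a tiny REPL around the chat engine built above.
def mini_chat_loop(chat_engine):
    chat_engine.reset()  # clear any existing conversation history
    while True:
        user_message = input("You: ")
        if user_message.strip().lower() in {"", "exit", "quit"}:
            break
        response = chat_engine.chat(user_message)
        print("Assistant:", response.response)

# mini_chat_loop(chat_engine)
```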
1633 | ], 1634 | "text/plain": [ 1635 | "" 1636 | ] 1637 | }, 1638 | "metadata": {}, 1639 | "output_type": "display_data" 1640 | } 1641 | ], 1642 | "source": [ 1643 | "response = chat_engine.chat(\"What are the enjoyable aspects of their work?\")\n", 1644 | "dm(response.response)" 1645 | ] 1646 | }, 1647 | { 1648 | "cell_type": "code", 1649 | "execution_count": 69, 1650 | "metadata": {}, 1651 | "outputs": [], 1652 | "source": [ 1653 | "# chat_engine.chat_repl()" 1654 | ] 1655 | } 1656 | ], 1657 | "metadata": { 1658 | "kernelspec": { 1659 | "display_name": "RAG demo", 1660 | "language": "python", 1661 | "name": "rag-demo" 1662 | }, 1663 | "language_info": { 1664 | "codemirror_mode": { 1665 | "name": "ipython", 1666 | "version": 3 1667 | }, 1668 | "file_extension": ".py", 1669 | "mimetype": "text/x-python", 1670 | "name": "python", 1671 | "nbconvert_exporter": "python", 1672 | "pygments_lexer": "ipython3", 1673 | "version": "3.11.4" 1674 | } 1675 | }, 1676 | "nbformat": 4, 1677 | "nbformat_minor": 4 1678 | } 1679 | -------------------------------------------------------------------------------- /notebooks/markdown_repo_explorer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Extending these workflows to other repositories\n", 8 | "**IMPORTANT**: Please be respectful of understanding the licenses of repositories you choose to crawl with any tools, including those provided in this demonstration. This repository is intended to help you quickly get started indexing documentation in GitHub repositories, not for large-scale web scraping." 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## The main tool" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "vscode": { 23 | "languageId": "plaintext" 24 | } 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "from rag_tools.repo.ops import DocumentationExtractor" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "The `DocumentationExtractor` has two functions, `filter_files` and `extract`." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Finding files to chunk\n", 43 | "First, a set of parameters defining how to crawl a GitHub repository is passed to a custom object defined in `rag_tools.repo.ops.DocumentationExtractor`. 
One option is to clone a repository and pass a local path to `filter_files` to return all the paths to the `considered_extensions`" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 7, 49 | "metadata": { 50 | "vscode": { 51 | "languageId": "plaintext" 52 | } 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "import os\n", 57 | "METAFLOW_DOCS_REPO_PATH = os.path.expanduser(\"~/Dev/metaflow-docs\")\n", 58 | "file_paths = DocumentationExtractor().filter_files(\n", 59 | " METAFLOW_DOCS_REPO_PATH,\n", 60 | " base_search_path = \"docs\",\n", 61 | " exclude_paths = [\"docs/v\"],\n", 62 | " exclude_files = [\"README.md\", \"README\"],\n", 63 | " considered_extensions = [\".md\"],\n", 64 | ")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 9, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "(58, ('/Users/eddie/Dev/metaflow-docs/docs/index.md', 'index.md'))" 76 | ] 77 | }, 78 | "execution_count": 9, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "len(file_paths), file_paths[0]" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Another workflow option is to only download the repository temporarily by passing the remote repositories URL to `extract`, and letting this function handle the `filter_files` call and any other parsing internally." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 19, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "import pandas as pd\n", 101 | "from functools import partial\n", 102 | "from rag_tools.filetypes.markdown import parse_md_file_headers\n", 103 | "\n", 104 | "# we will pass this to the extract(parser=...) arg. 
\n", 105 | "# you can replace this with any function that takes in filepath and filename args.\n", 106 | "def headers_df_parser(filepath, filename, deployment_url, subdir) -> pd.DataFrame:\n", 107 | " return parse_md_file_headers(filepath, deployment_url=deployment_url, subdir=subdir)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 20, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "params = {\n", 117 | " \"deployment_url\": \"docs.metaflow.org\",\n", 118 | " \"repository_path\": \"https://github.com/Netflix/metaflow-docs\",\n", 119 | " \"repository_ref\": \"master\",\n", 120 | " \"base_search_path\": \"docs\",\n", 121 | " \"exclude_paths\": [\"docs/v\"],\n", 122 | " \"exclude_files\": [\"README.md\", \"README\"],\n", 123 | "}" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 21, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "extractor = DocumentationExtractor(repo_url=params[\"repository_path\"])\n", 133 | "\n", 134 | "dfs = extractor.extract(\n", 135 | " base_path=params[\"base_search_path\"],\n", 136 | " ref=params[\"repository_ref\"],\n", 137 | " exclude_paths=params[\"exclude_paths\"],\n", 138 | " exclude_files=params[\"exclude_files\"],\n", 139 | " considered_extensions=[\".md\"],\n", 140 | " parser=partial(headers_df_parser, deployment_url=params['deployment_url'], subdir=params['base_search_path']),\n", 141 | ")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 24, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "df = pd.concat(dfs)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 26, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/html": [ 161 | "
\n", 162 | "\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | "
headercontentstypepage_urlis_howtochar_countword_count
10**Store and load objects to/from a known S3 lo...The above examples inferred the S3 location ba...H4https://docs.metaflow.org/scaling/data#store-a...False1367219
0Deploying Variants of Event-Triggered FlowsConsider this advanced scenario: You have depl...H1https://docs.metaflow.org/production/event-tri...False983119
99[Fix `environment is not callable` error when ...Using `@environment` would often result in an ...H4https://docs.metaflow.org/internals/release-no...False27934
71State Machine execution history logging for AW...Metaflow now logs [State Machine execution his...H4https://docs.metaflow.org/internals/release-no...False58772
3Single Flow, multiple developersIf `ProjectFlow` did not have a `@project deco...H3https://docs.metaflow.org/production/coordinat...False1773232
\n", 241 | "
" 242 | ], 243 | "text/plain": [ 244 | " header \\\n", 245 | "10 **Store and load objects to/from a known S3 lo... \n", 246 | "0 Deploying Variants of Event-Triggered Flows \n", 247 | "99 [Fix `environment is not callable` error when ... \n", 248 | "71 State Machine execution history logging for AW... \n", 249 | "3 Single Flow, multiple developers \n", 250 | "\n", 251 | " contents type \\\n", 252 | "10 The above examples inferred the S3 location ba... H4 \n", 253 | "0 Consider this advanced scenario: You have depl... H1 \n", 254 | "99 Using `@environment` would often result in an ... H4 \n", 255 | "71 Metaflow now logs [State Machine execution his... H4 \n", 256 | "3 If `ProjectFlow` did not have a `@project deco... H3 \n", 257 | "\n", 258 | " page_url is_howto char_count \\\n", 259 | "10 https://docs.metaflow.org/scaling/data#store-a... False 1367 \n", 260 | "0 https://docs.metaflow.org/production/event-tri... False 983 \n", 261 | "99 https://docs.metaflow.org/internals/release-no... False 279 \n", 262 | "71 https://docs.metaflow.org/internals/release-no... False 587 \n", 263 | "3 https://docs.metaflow.org/production/coordinat... False 1773 \n", 264 | "\n", 265 | " word_count \n", 266 | "10 219 \n", 267 | "0 119 \n", 268 | "99 34 \n", 269 | "71 72 \n", 270 | "3 232 " 271 | ] 272 | }, 273 | "execution_count": 26, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "df.sample(5)" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "RAG demo", 286 | "language": "python", 287 | "name": "rag-demo" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.11.4" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 4 304 | } 305 | -------------------------------------------------------------------------------- /rag_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/__init__.py -------------------------------------------------------------------------------- /rag_tools/databases/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/databases/__init__.py -------------------------------------------------------------------------------- /rag_tools/databases/vector_database.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | class AbstractDB: 5 | 6 | def __init__(self): 7 | pass 8 | 9 | def create_index(self, **kwargs): 10 | pass 11 | 12 | def upsert(self, **kwargs): 13 | pass 14 | 15 | def vector_search(self, **kwargs): 16 | pass 17 | 18 | def destroy_index(self, **kwargs): 19 | pass 20 | 21 | 22 | class PineconeDB(AbstractDB): 23 | 24 | def __init__(self,): 25 | super().__init__() 26 | import pinecone 27 | pinecone.init( 28 | api_key=os.environ['PINECONE_API_KEY'], 29 | environment=os.environ['GCP_ENVIRONMENT'] 30 | ) 31 | 32 | def create_index(self, index_name, dimension, metric='cosine'): 33 | import pinecone 34 | 35 | if index_name not in pinecone.list_indexes(): 36 | pinecone.create_index( 37 | 
name=index_name, 38 | dimension=dimension, 39 | metric='cosine' 40 | ) 41 | # wait a moment for the index to be fully initialized 42 | time.sleep(1) 43 | 44 | def upsert(self, index_name, embeddings, texts, ids): 45 | import pinecone 46 | 47 | # now connect to the index 48 | index = pinecone.GRPCIndex(index_name) 49 | 50 | # upsert the vectors, but this should be done in batches not one by one. 51 | print("Upserting vectors", end="") 52 | for idx, (txt, emb) in enumerate(zip(texts, embeddings)): 53 | upsert_response = index.upsert( 54 | vectors=[ 55 | {'id': f'vec{idx}', 56 | 'values': emb.tolist(), 57 | 'metadata': {'text': txt}, 58 | } 59 | ] 60 | ) 61 | print(".", end="") 62 | 63 | def vector_search(self, index_name, vector, k=1): 64 | import pinecone 65 | index = pinecone.GRPCIndex(index_name) 66 | xc = index.query(vector.tolist(), top_k=k, include_metadata=True) 67 | return xc 68 | 69 | def destroy_index(self, index_name): 70 | import pinecone 71 | pinecone.delete_index(index_name) 72 | 73 | 74 | class LanceDB(AbstractDB): 75 | 76 | """ 77 | LanceDB is a vector database that uses Lance to store and search vectors. 78 | """ 79 | 80 | def __init__(self): 81 | super().__init__() 82 | self.mode = 'overwrite' 83 | self.dataset_path = "../../chatbot.lance" 84 | self.local_store = True 85 | 86 | def create_index(self, table_name, embeddings, texts, ids): 87 | 88 | import lance 89 | import pandas as pd 90 | import pyarrow as pa 91 | from lance.vector import vec_to_table 92 | 93 | data = pd.DataFrame({"text": texts, "id": ids}) 94 | table = vec_to_table(embeddings) 95 | combined = pa.Table.from_pandas(data).append_column("vector", table["vector"]) 96 | 97 | if self.local_store: 98 | ds = lance.write_dataset(combined, self.dataset_path, mode=self.mode) 99 | 100 | def upsert(self, table_name, embeddings, texts, ids): 101 | raise NotImplementedError("This LanceDB wrapper does not have upsert functionality beyond the create_index step yet.") 102 | 103 | def vector_search(self, table_name, vector, k=3): 104 | import lance 105 | ds = lance.dataset(self.dataset_path) 106 | return ds.to_table( 107 | nearest={ 108 | "column": "vector", 109 | "k": k, 110 | "q": vector, 111 | "nprobes": 20, 112 | "refine_factor": 100 113 | }).to_pandas() 114 | 115 | def destroy_index(self, table_name): 116 | if self.local_store: 117 | import shutil 118 | shutil.rmtree(self.dataset_path) -------------------------------------------------------------------------------- /rag_tools/embedders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/embedders/__init__.py -------------------------------------------------------------------------------- /rag_tools/embedders/embedder.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class AbstractEmbedder: 4 | def __init__(self, **kwargs): 5 | pass 6 | 7 | class OpenAIEmbedder(AbstractEmbedder): 8 | 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def embed(self, sentences): 13 | pass 14 | 15 | 16 | class SentenceTransformerEmbedder(AbstractEmbedder): 17 | def __init__(self, model_path, device="cpu"): 18 | super().__init__() 19 | from sentence_transformers import SentenceTransformer 20 | 21 | self.model = SentenceTransformer(model_path) 22 | self.model.to(device) 23 | 24 | def embed(self, sentences): 25 | return self.model.encode(sentences, show_progress_bar=True) 26 | 27 | 28 | class 
FashionCLIPEmbedder(AbstractEmbedder): 29 | def __init__(self): 30 | super().__init__() 31 | from fashion_clip.fashion_clip import FashionCLIP 32 | 33 | self.fclip = FashionCLIP('fashion-clip') 34 | 35 | def embed(self, images): 36 | import numpy as np 37 | image_embeddings = self.fclip.encode_images(images, batch_size=32) 38 | 39 | image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True) 40 | return image_embeddings 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /rag_tools/filetypes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/filetypes/__init__.py -------------------------------------------------------------------------------- /rag_tools/filetypes/markdown.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from functools import partial 4 | import os 5 | import pickle 6 | from datetime import datetime 7 | import frontmatter 8 | import re 9 | import pandas as pd 10 | from slugify import slugify 11 | 12 | 13 | HEADERS_REGEX = re.compile("#{1,5}") 14 | NUMBERED_HEADER_REGEX = re.compile("") 15 | END_ESCAPE = "< END >" 16 | QUESTION_HEADER = "## Question" 17 | 18 | 19 | def is_useful_line(line): 20 | """ 21 | Utility function to decide if this is some autogenerated or unhelpful line from filetypes pages, which can be generated in a variety of ways. 22 | """ 23 | # if line == "\n": # New lines are helpful in the sections to determine paragraph chunks. Might want to make this a generalized filter instead of hand coded heuristics. 24 | # return False 25 | if line == "---\n": 26 | return False 27 | if line == "\n---": 28 | return False 29 | if "