├── .gitignore
├── LICENSE
├── README.md
├── chat_app.py
├── data
    ├── llama-index-tutorial
    │   └── paul_graham_essay.txt
    ├── processed_df_1692826394541260.csv
    └── test-data
    │   ├── chunked_by_header.csv
    │   ├── mf-docs
    │       └── docs
    │       │   ├── api
    │       │       └── current.md
    │       │   └── introduction
    │       │       └── why-metaflow.md
    │   └── ob
    │       ├── blog
    │           ├── llm-tuning-metaflow.md
    │           └── metaflow-fast-data.md
    │       └── docs
    │           └── data-science
    │               ├── compute
    │                   └── use_multiple_cpu_cores.md
    │               └── orchestration
    │                   └── dags_for_data_science.md
├── environment.yaml
├── flows
    ├── config
    │   └── repo_params.py
    ├── data_table_processor.py
    ├── lancedb_index.py
    ├── markdown_chunker.py
    ├── pinecone_index.py
    ├── test_simple_embedding.py
    └── test_simple_query.py
├── images
    └── cover.png
├── notebooks
    ├── analyze_chunks.ipynb
    ├── llama_index_exploration.ipynb
    └── markdown_repo_explorer.ipynb
├── rag_tools
    ├── __init__.py
    ├── databases
    │   ├── __init__.py
    │   └── vector_database.py
    ├── embedders
    │   ├── __init__.py
    │   └── embedder.py
    ├── filetypes
    │   ├── __init__.py
    │   └── markdown.py
    ├── llms
    │   ├── __init__.py
    │   └── llms_search.py
    └── repo
    │   ├── __init__.py
    │   └── ops.py
├── requirements.txt
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | rag_tools/filetypes/__pycache__
 2 | rag_tools/config/__pycache__
 3 | rag_tools/repo/__pycache__
 4 | rag_tools/__pycache__
 5 | rag_tools/databases/__pycache__/
 6 | rag_tools/embedders/__pycache__/
 7 | rag_tools/llms/__pycache__/
 8 | rag_tools.egg-info
 9 | notebooks/.ipynb_checkpoints
10 | .metaflow
11 | rag_demo.egg-info/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Read this blog post for context: [Retrieval-Augmented Generation: How to Use Your Data to Guide LLMs](https://outerbounds.com/blog/retrieval-augmented-generation/) 
 2 | 
 3 | <a href="https://outerbounds.com/blog/retrieval-augmented-generation/">
 4 | <img style="display: block; float: left; max-width: 20%; height: auto; margin: auto; float: none!important;" src="images/cover.png"/>
 5 | </a>
 6 | 
 7 | 
 8 | # 📦  Setup
 9 | 
10 | ## With Conda, Mamba, or Micromamba
11 | ```
12 | conda env create -f environment.yaml
13 | ```
14 | 
15 | ## Without Conda
16 | ```
17 | pip install -r requirements.txt
18 | ```
19 | 
20 | # 🔀🔁 Data collection workflows
21 | 
22 | ## Chunk Markdown files in GitHub repositories
23 | 
24 | Run the repository extraction and Markdown chunking flow.
25 | Looks in `/flows/config/repo_params.py` to specify new configurations; be mindful of repository licensing!
26 | ```
27 | python flows/markdown_chunker.py run
28 | ```
29 | 
30 | ## Post-process the results
31 | 
32 | Post-process the results of the `MarkdownChunker` flow. 
33 | ```
34 | python flows/data_table_processor.py run
35 | ```
36 | 
37 | ## Analyze what just happened
38 | Let's inspect the results of these workflows in a notebook.
39 | 
40 | > If you are in a Conda environment, you need to install the kernel like so before opening the notebooks:
41 | > ```
42 | > pip install ipykernel
43 | > python -m ipykernel install --user --name rag-demo --display-name "RAG demo"
44 | > ```
45 | 
46 | Open `notebooks/analyze_chunks.ipynb` to use the Metaflow Client API to explore the results of the flow runs:
47 | ```
48 | jupyter notebook notebooks/analyze_chunks.ipynb
49 | ```
50 | 
51 | # 🦙🧠 RAG time
52 | 
53 | ## LlamaIndex
54 | Now that we have indexed a bunch of source objects, let's index them and use them to modify an LLM context window.
55 | 
56 | You can either use a Llama 2 model, or OpenAI APIs. 
57 | Letting LlamaIndex use Llama 2 runs locally, so it doesn't require an API key but takes a lot longer.
58 | Using OpenAI APIs is faster and cheap. You can create/find your API key [here](https://platform.openai.com/account/api-keys).
59 | 
60 | Open `notebooks/llama_index_exploration.ipynb`
61 | ```
62 | jupyter notebook notebooks/llama_index_exploration.ipynb
63 | ```
64 | 
65 | ### ▶️ Run a demo in Streamlit
66 | 
67 | #### 🔑 Get an OpenAI API Key
68 | Go [here](https://platform.openai.com/account/api-keys), copy your key value, and set the following environment variable:
69 | ```
70 | export OPENAI_API_KEY=<YOUR KEY>
71 | ```
72 | 
73 | #### 🤖 Run the app
74 | ```
75 | streamlit run chat_app.py
76 | ```
77 | 
78 | ## Setup for indexing workflows
79 | There are two indexing workflows in the `/flows` folder, one indexes into Pinecone for a VectorDB and another that uses the open-source LanceDB.
80 |  
81 | ### 🔑 Get a Pinecone API Key
82 | Go [here](https://app.pinecone.io/), create a Pinecone account if you have to, copy your API key, and set the following environment variable:
83 | ```
84 | export PINECONE_API_KEY=<YOUR KEY>
85 | ```
86 | 
87 | Set the following environment variable too:
88 | ```
89 | export GCP_ENVIRONMENT=us-central1-gcp
90 | ```
91 | 
92 | ### ▶️ Run the flows
93 | ```
94 | python flows/pinecone_index.py run
95 | python flows/lancedb_index.py run
96 | ```


--------------------------------------------------------------------------------
/chat_app.py:
--------------------------------------------------------------------------------
  1 | from llama_index import Document, VectorStoreIndex, set_global_service_context, ServiceContext
  2 | from llama_index.retrievers import VectorIndexRetriever
  3 | from llama_index.response_synthesizers import get_response_synthesizer
  4 | from llama_index.llms import OpenAI
  5 | from dataclasses import dataclass
  6 | from metaflow import Flow
  7 | import streamlit as st
  8 | from typing import List
  9 | import time
 10 | import os
 11 | 
 12 | st.set_page_config(layout="centered")
 13 | st.title("Metaflow Chat Bot")
 14 | st.markdown("This is a bare bones LLM-powered chat bot that uses the results of Metaflow workflows to answer questions about Metaflow.")
 15 | 
 16 | subquery_prompt = """
 17 |     Answer this question only if there is relevant context below: {}
 18 |     If there is nothing in the context say: "Could not find relevant context."
 19 |     Here is the retrieved context: {}
 20 | """
 21 | 
 22 | # model = st.text_input('OpenAI model', 'gpt-3.5-turbo')
 23 | # temp = st.slider(label='Temperature', min_value=0.0, max_value=1.0, step=0.01, value=0.0)
 24 | # chat_mode = st.text_input('LlamaIndex chat engine mode', 'react')
 25 | # K = st.number_input('K results to return', min_value=1, max_value=5, value=2, step=1)
 26 | model = 'gpt-3.5-turbo'
 27 | temp = 0.0
 28 | chat_mode = 'react'
 29 | K = 2
 30 | 
 31 | llm = OpenAI(model=model, temperature=temp, max_tokens=2048)
 32 | service_context = ServiceContext.from_defaults(llm=llm)
 33 | set_global_service_context(service_context)
 34 | 
 35 | # find latest Metaflow run that saved processed df
 36 | run = None
 37 | for _run in Flow('DataTableProcessor'):
 38 |     if _run.data.save_processed_df:
 39 |         run = _run
 40 |         break
 41 | st.write("Found processed df in run: {}".format(run.id))
 42 | 
 43 | @dataclass
 44 | class Context:
 45 | 
 46 |     def __init__(self, response: str, source_node_ids: List[str]):
 47 |         self.response = response
 48 |         self.source_node_ids = source_node_ids
 49 | 
 50 |     def get_link_df(self, meta_df, link_col = 'doc_id'):
 51 |         return meta_df[meta_df[link_col].isin(self.source_node_ids)]
 52 | 
 53 | def qa_iter(
 54 |     question: str, 
 55 |     index: VectorStoreIndex, 
 56 |     k:int = 2, 
 57 |     response_mode:str = 'tree_summarize'
 58 | ) -> Context:
 59 |     "Match a question against an index and returns the response."
 60 |     retriever = VectorIndexRetriever(index=index, similarity_top_k=k)
 61 |     response_synthesizer = get_response_synthesizer(response_mode=response_mode)
 62 |     query_engine = index.as_query_engine(response_synthesizer=response_synthesizer, retriever=retriever)
 63 |     query_res = query_engine.query(question)
 64 |     return Context(
 65 |         response=query_res.response, source_node_ids=list(query_res.metadata.keys())
 66 |     )
 67 | 
 68 | def st_output_format(similar_chunk_df):
 69 |     md_outputs = ["#### You may find these links helpful:"]
 70 |     for _, chunk in similar_chunk_df.iterrows():
 71 |         md_outputs.append(f"##### [{chunk.header}]({chunk.page_url})")
 72 |         md_outputs.append(f"{chunk.contents[:100]}...")
 73 |     if len(md_outputs) == 1:
 74 |         md_outputs = []
 75 |     return md_outputs
 76 | 
 77 | def generative_search_engine_iter(question, index, meta_df, k=2, meta_df_id_col='doc_id'):
 78 |     "Assumes index and df are defined in the global scope"
 79 |     context = qa_iter(question, index, k=k)
 80 |     similar_chunk_df = meta_df[meta_df[meta_df_id_col].isin(context.source_node_ids)]
 81 |     return context.response, st_output_format(similar_chunk_df)
 82 | 
 83 | # use the processed df to build the index
 84 | def get_documents_from_content_section_df(df):
 85 |     ids = []; documents = []
 86 |     for i, text in enumerate(df.contents):
 87 |         doc = Document(text=text, id_=i)
 88 |         documents.append(doc)
 89 |         ids.append(doc.id_)
 90 |     return documents, ids
 91 | 
 92 | @st.cache_resource
 93 | def setup_index():
 94 |     df = run.data.processed_df
 95 |     documents, ids = get_documents_from_content_section_df(df)
 96 |     df['doc_id'] = ids
 97 |     index = VectorStoreIndex(documents)
 98 |     return index, df
 99 | 
100 | index, df = setup_index()
101 | chat_engine = index.as_chat_engine(chat_mode=chat_mode, verbose=True, streaming=True)
102 | 
103 | # Initialize chat history
104 | st.markdown("# Chat history")
105 | if "messages" not in st.session_state:
106 |     st.session_state.messages = []
107 | 
108 | # Display chat messages from history on app rerun
109 | for message in st.session_state.messages:
110 |     with st.chat_message(message["role"]):
111 |         st.markdown(message["content"])
112 | 
113 | # React to user input
114 | if user_prompt := st.chat_input("Hey Metaflower 🌻 what's on your mind?"):
115 | 
116 |     # Display user message in chat message container
117 |     with st.chat_message("user"):
118 |         st.markdown(user_prompt)
119 | 
120 |     # Add user message to chat history
121 |     st.session_state.messages.append({"role": "user", "content": user_prompt})
122 | 
123 |     with st.spinner("Finding context..."):
124 |         subquery_response, md_outputs = generative_search_engine_iter(user_prompt, index, df, K)
125 | 
126 |     # Display assistant response in chat message container
127 |     with st.chat_message("Metaflow assistant"):
128 | 
129 |         message_placeholder = st.empty()
130 | 
131 |         streaming_response = chat_engine.stream_chat(subquery_prompt.format(user_prompt, subquery_response))
132 |         full_response = ""
133 |         for text in streaming_response.response_gen:
134 |             full_response += text
135 |             message_placeholder.markdown(full_response + "▌")
136 | 
137 |         for line in md_outputs:
138 |             st.markdown(line)
139 | 
140 |     # Add assistant response to chat history
141 |     st.session_state.messages.append({"role": "Metaflow assistant", "content": full_response})
142 | 
143 | if st.button("Reset chat engine's memory"):
144 |     chat_engine.reset()


--------------------------------------------------------------------------------
/data/test-data/mf-docs/docs/api/current.md:
--------------------------------------------------------------------------------
  1 | # Current - Operating a run
  2 | 
  3 | <!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! Instead, edit the notebook w/the location & name as this file. -->
  4 | 
  5 | The `current` object is used to inspect and manipulate the currently executing run. It is only available during flow execution, i.e. inside a `FlowSpec` class and functions called from its steps. You can access the object simply by importing it: `from metaflow import current`.
  6 | 
  7 | The attributes available in `current` depend on the decorators assigned to the flow and the step where `current` is used. Attributes that are always available are listed under *Common Attributes* below. Decorator-specific attributes are listed under the decorator name.
  8 | 
  9 | ## Common Attributes
 10 | 
 11 | These attributes are always available in the `current` object.
 12 | 
 13 | 
 14 | <DocSection type="property" name="current.is_running_flow" module="__main__" show_import="False" heading_level="4">
 15 | 
 16 | <Description summary="Returns True if called inside a running Flow, False otherwise.\n\nYou can use this property e.g. inside a library to choose the desired\nbehavior depending on the execution context.\n" />
 17 | <ParamSection name="Returns">
 18 | <Parameter type="bool" desc="True if called inside a run, False otherwise." />
 19 | </ParamSection>
 20 | </DocSection>
 21 | 
 22 | 
 23 | 
 24 | <DocSection type="property" name="current.flow_name" module="__main__" show_import="False" heading_level="4">
 25 | 
 26 | <Description summary="The name of the currently executing flow.\n" />
 27 | <ParamSection name="Returns">
 28 | <Parameter type="str" desc="Flow name." />
 29 | </ParamSection>
 30 | </DocSection>
 31 | 
 32 | 
 33 | 
 34 | <DocSection type="property" name="current.run_id" module="__main__" show_import="False" heading_level="4">
 35 | 
 36 | <Description summary="The run ID of the currently executing run.\n" />
 37 | <ParamSection name="Returns">
 38 | <Parameter type="str" desc="Run ID." />
 39 | </ParamSection>
 40 | </DocSection>
 41 | 
 42 | 
 43 | 
 44 | <DocSection type="property" name="current.step_name" module="__main__" show_import="False" heading_level="4">
 45 | 
 46 | <Description summary="The name of the currently executing step.\n" />
 47 | <ParamSection name="Returns">
 48 | <Parameter type="str" desc="Step name." />
 49 | </ParamSection>
 50 | </DocSection>
 51 | 
 52 | 
 53 | 
 54 | <DocSection type="property" name="current.task_id" module="__main__" show_import="False" heading_level="4">
 55 | 
 56 | <Description summary="The task ID of the currently executing task.\n" />
 57 | <ParamSection name="Returns">
 58 | <Parameter type="str" desc="Task ID." />
 59 | </ParamSection>
 60 | </DocSection>
 61 | 
 62 | 
 63 | 
 64 | <DocSection type="property" name="current.retry_count" module="__main__" show_import="False" heading_level="4">
 65 | 
 66 | <Description summary="The index of the task execution attempt.\n\nThis property returns 0 for the first attempt to execute the task.\nIf the @retry decorator is used and the first attempt fails, this\nproperty returns the number of times the task was attempted prior\nto the current attempt.\n" />
 67 | <ParamSection name="Returns">
 68 | <Parameter type="int" desc="The retry count." />
 69 | </ParamSection>
 70 | </DocSection>
 71 | 
 72 | 
 73 | 
 74 | <DocSection type="property" name="current.origin_run_id" module="__main__" show_import="False" heading_level="4">
 75 | 
 76 | <Description summary="The run ID of the original run this run was resumed from.\n\nThis property returns None for ordinary runs. If the run\nwas started by the resume command, the property returns\nthe ID of the original run.\n\nYou can use this property to detect if the run is resumed\nor not.\n" />
 77 | <ParamSection name="Returns">
 78 | <Parameter type="str" desc="Run ID of the original run." />
 79 | </ParamSection>
 80 | </DocSection>
 81 | 
 82 | 
 83 | 
 84 | <DocSection type="property" name="current.pathspec" module="__main__" show_import="False" heading_level="4">
 85 | 
 86 | <Description summary="Pathspec of the current run, i.e. a unique\nidentifier of the current task. The returned\nstring follows this format:\n```\n{flow_name}/{run_id}/{step_name}/{task_id}\n```\n" />
 87 | <ParamSection name="Returns">
 88 | <Parameter type="str" desc="Pathspec." />
 89 | </ParamSection>
 90 | </DocSection>
 91 | 
 92 | 
 93 | 
 94 | <DocSection type="property" name="current.namespace" module="__main__" show_import="False" heading_level="4">
 95 | 
 96 | <Description summary="The current namespace.\n" />
 97 | <ParamSection name="Returns">
 98 | <Parameter type="str" desc="Namespace." />
 99 | </ParamSection>
100 | </DocSection>
101 | 
102 | 
103 | 
104 | <DocSection type="property" name="current.username" module="__main__" show_import="False" heading_level="4">
105 | 
106 | <Description summary="The name of the user who started the run, if available.\n" />
107 | <ParamSection name="Returns">
108 | <Parameter type="str" desc="User name." />
109 | </ParamSection>
110 | </DocSection>
111 | 
112 | 
113 | ## Decorator-specific attributes
114 | 
115 | These attributes are only available when the decorator is present.
116 | 
117 | ### @project
118 | 
119 | [The @project decorator](/production/coordinating-larger-metaflow-projects) exposes attributes related to the current deployment.
120 | 
121 | 
122 | <DocSection type="property" name="current.project_name" module="__main__" show_import="False" heading_level="4">
123 | 
124 | <Description summary="The name of the project assigned to this flow,\ni.e. `X` in `@project(name=X)`.\n" />
125 | <ParamSection name="Returns">
126 | <Parameter type="str" desc="Project name." />
127 | </ParamSection>
128 | </DocSection>
129 | 
130 | 
131 | 
132 | <DocSection type="property" name="current.project_flow_name" module="__main__" show_import="False" heading_level="4">
133 | 
134 | <Description summary="The flow name prefixed with the current project\nand branch. This name identifies the deployment\non a production scheduler.\n" />
135 | <ParamSection name="Returns">
136 | <Parameter type="str" desc="Flow name prefixed with project information." />
137 | </ParamSection>
138 | </DocSection>
139 | 
140 | 
141 | 
142 | <DocSection type="property" name="current.branch_name" module="__main__" show_import="False" heading_level="4">
143 | 
144 | <Description summary="The current branch, i.e. `X` in\n`--branch=X` set during deployment.\n" />
145 | <ParamSection name="Returns">
146 | <Parameter type="str" desc="Branch name." />
147 | </ParamSection>
148 | </DocSection>
149 | 
150 | 
151 | 
152 | <DocSection type="property" name="current.is_user_branch" module="__main__" show_import="False" heading_level="4">
153 | 
154 | <Description summary="True if the flow is deployed without a\nspecific `--branch` or a `--production` flag.\n" />
155 | <ParamSection name="Returns">
156 | <Parameter type="bool" desc="True if the deployment does not correspond to a specific branch." />
157 | </ParamSection>
158 | </DocSection>
159 | 
160 | 
161 | 
162 | <DocSection type="property" name="current.is_production" module="__main__" show_import="False" heading_level="4">
163 | 
164 | <Description summary="True if the flow is deployed with the `--production`\nflag.\n" />
165 | <ParamSection name="Returns">
166 | <Parameter type="bool" desc="True if the flow is deployed in `--production`." />
167 | </ParamSection>
168 | </DocSection>
169 | 
170 | 
171 | ### @card
172 | 
173 | [The @card decorator](/metaflow/visualizing-results) exposes functions in `current` that allow you to customize
174 | the contents of cards using [card components](/api/cards#Card-components). For an overview of card-related APIs, see [the API reference for cards](/api/cards).
175 | 
176 | 
177 | <DocSection type="method" name="current.card.__getitem__" module="metaflow" show_import="False" heading_level="4" link="https://github.com/Netflix/metaflow/tree/master/__main__.py#L27">
178 | <SigArgSection>
179 | <SigArg name="self" />
180 | </SigArgSection>
181 | <Description summary="Choose a specific card for manipulation." extended_summary="When multiple @card decorators are present, you can add an\n`ID` to distinguish between them, `@card(id=ID)`. This allows you\nto add components to a specific card like this:\n```\ncurrent.card[ID].append(component)\n```" />
182 | <ParamSection name="Parameters">
183 | 	<Parameter name="key" type="str" desc="Card ID." />
184 | </ParamSection>
185 | <ParamSection name="Returns">
186 | 	<Parameter type="CardComponentCollector" desc="An object with `append` and `extend` calls which allow you to\nadd components to the chosen card." />
187 | </ParamSection>
188 | </DocSection>
189 | 
190 | 
191 | 
192 | <DocSection type="method" name="current.card.__setitem__" module="metaflow" show_import="False" heading_level="4" link="https://github.com/Netflix/metaflow/tree/master/__main__.py#L29">
193 | <SigArgSection>
194 | <SigArg name="self" />
195 | </SigArgSection>
196 | <Description summary="Specify components of the chosen card." extended_summary="Instead of adding components to a card individually with `current.card[ID].append(component)`,\nuse this method to assign a list of components to a card, replacing the existing components:\n```\ncurrent.card[ID] = [FirstComponent, SecondComponent]\n```" />
197 | <ParamSection name="Parameters">
198 | 	<Parameter name="key: str" desc="Card ID." />
199 | 	<Parameter name="value: List[CardComponent]" desc="List of card components to assign to this card." />
200 | </ParamSection>
201 | </DocSection>
202 | 
203 | 
204 | 
205 | <DocSection type="method" name="current.card.append" module="metaflow" show_import="False" heading_level="4" link="https://github.com/Netflix/metaflow/tree/master/__main__.py#L31">
206 | <SigArgSection>
207 | <SigArg name="self" />
208 | </SigArgSection>
209 | <Description summary="Appends a component to the current card." />
210 | <ParamSection name="Parameters">
211 | 	<Parameter name="component" type="CardComponent" desc="Card component to add to this card." />
212 | </ParamSection>
213 | </DocSection>
214 | 
215 | 
216 | 
217 | <DocSection type="method" name="current.card.extend" module="metaflow" show_import="False" heading_level="4" link="https://github.com/Netflix/metaflow/tree/master/__main__.py#L33">
218 | <SigArgSection>
219 | <SigArg name="self" />
220 | </SigArgSection>
221 | <Description summary="Appends many components to the current card." />
222 | <ParamSection name="Parameters">
223 | 	<Parameter name="component" type="Iterator[CardComponent]" desc="Card components to add to this card." />
224 | </ParamSection>
225 | </DocSection>
226 | 
227 | 


--------------------------------------------------------------------------------
/data/test-data/mf-docs/docs/introduction/why-metaflow.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | pagination_prev: index
  3 | ---
  4 | 
  5 | import Caption from "/src/components/Caption";
  6 | 
  7 | # Why Metaflow
  8 | 
  9 | ### 1. Modern businesses are eager to utilize data science and ML
 10 | 
 11 | In the past, data scientists and ML engineers had to rely
 12 | on a medley of point solutions and custom systems to build ML and data science
 13 | applications.
 14 | 
 15 | ![Many data science opportunities](/assets/mf-intro-01.png)
 16 | 
 17 | ---
 18 | 
 19 | ### 2. What is common in DS/ML applications?
 20 | 
 21 | Applications can be built quicker and more robustly if they stand on a common, human-friendly foundation. But what should the foundation cover?
 22 | 
 23 | ![A solid foundation for all use cases](/assets/mf-intro-02.png)
 24 | 
 25 | ---
 26 | 
 27 | ### 3. All DS/ML applications use data
 28 | 
 29 | **Data** may come in different shapes and sizes and may be loaded from various data stores. However, no matter what data is used, accessing and processing it shouldn't be too cumbersome.
 30 | 
 31 | ![Data](/assets/mf-intro-03.png)
 32 | 
 33 | ---
 34 | 
 35 | ### 4. DS/ML applications need to perform computation
 36 | 
 37 | Some applications require a tremendous amount of compute power - think computer vision - while some do with less.
 38 | Regardless of the scale, all applications need to perform **computation** reliably. Thanks to cloud computing, 
 39 | data scientists and ML engineers should be able to utilize elastic compute resources without friction.
 40 | 
 41 | ![Compute](/assets/mf-intro-04.png)
 42 | 
 43 | ---
 44 | 
 45 | ### 5. DS/ML applications consists of multiple interconnected parts
 46 | 
 47 | Consider an application that loads data, transforms it, trains a bunch of models, chooses the best performing one, runs
 48 | inference, and writes the results to a database. Multi-steps workflows like this are a norm in ML. **A workflow
 49 | orchestrator** is needed to make sure all steps get executed in order, on time.
 50 | 
 51 | ![Orchestration](/assets/mf-intro-05.png)
 52 | 
 53 | ---
 54 | 
 55 | ### 6. DS/ML applications evolve over time incrementally
 56 | 
 57 | Rarely a real-world application is built and deployed only once. Instead, a typical application is built gradually,
 58 | through contributions by many people. The project needs to be tracked, organized, and **versioned**, which enables systematic and
 59 | continuous improvement over time.
 60 | 
 61 | ![Versioning](/assets/mf-intro-06.png)
 62 | 
 63 | ---
 64 | 
 65 | ### 7. DS/ML applications produce business value in various ways
 66 | 
 67 | To produce real business value, DS/ML applications can't live in a walled garden. They must be integrated with the surrounding systems seamlessly: Some applications enhance data in a database, some power internal dashboards or microservices, whereas some power user-facing products. There are many such ways to **deploy** ML in production. The more valuable the application, the more carefully it needs to be operated and monitored as well.
 68 | 
 69 | ![Deployment](/assets/mf-intro-07.png)
 70 | 
 71 | ---
 72 | 
 73 | ### 8. DS/ML applications should leverage the best tools available
 74 | 
 75 | For many data scientists and ML engineers, the most rewarding part of the project is **modeling**. Using their domain
 76 | knowledge and expertise, the modeler should be able to choose the best tool for the job amongst off-the-shelf libraries, such as PyTorch, XGBoost, Scikit Learn, and many others. Or, if necessary, they should be able to use a wholly custom approach.
 77 | 
 78 | ![Modeling](/assets/mf-intro-08.png)
 79 | 
 80 | ---
 81 | 
 82 | ### 9. Metaflow covers the full stack of DS/ML infrastructure
 83 | 
 84 | Metaflow was originally created at Netflix, motivated by the realization that data scientists and ML engineers need
 85 | help with all these concerns: Any gaps or friction in the stack can slow down the project drastically. Thanks to a 
 86 | common foundation
 87 | provided by Metaflow, data scientists can iterate on ideas quickly and 
 88 | deploy them confidently by relying on a well-defined architecture and best practices, shared by everyone in the team.
 89 | 
 90 | ![Full-stack Metaflow](/assets/mf-intro-09.png)
 91 | 
 92 | ---
 93 | 
 94 | ### 10. Metaflow takes care of the plumbing, so you can focus on the fun parts
 95 | 
 96 | Metaflow provides a robust and user-friendly foundation for a wide spectrum of data-intensive applications, including most data science and ML use cases. Data scientists and ML engineers who know the basics of Python can build their own applications, models, and policies on top of it, while Metaflow takes care of the low-level infrastructure: data, compute, orchestration, and versioning.
 97 | 
 98 | ![Full stack triangles](/assets/mf-intro-10.png)
 99 | 
100 | ---
101 | 
102 | ### 11. Metaflow relies on systems that engineers know and trust
103 | 
104 | Metaflow was designed at Netflix to serve the needs of business-critical ML/DS applications. It relies on proven and scalable infrastructure which works for small and large organizations alike. Metaflow integrates with all the
105 | top clouds as well as with Kubernetes and systems around them in a responsible manner. It respects the security and other
106 | policies of your company, making engineering teams happy too.
107 | 
108 | ![Existing infrastructure](/assets/mf-intro-11.png)
109 | 
110 | ---
111 | 
112 | ### 12. Metaflow is used by hundreds of innovative companies
113 | 
114 | Today, Metaflow powers thousands of ML/DS applications at innovative companies such as 
115 | [Netflix](https://netflixtechblog.com/supporting-content-decision-makers-with-machine-learning-995b7b76006f), 
116 | [CNN](https://medium.com/cnn-digital/accelerating-ml-within-cnn-983f6b7bd2eb), 
117 | [SAP](https://blogs.sap.com/2022/04/20/train-your-model-in-sap-ai-core-using-the-metaflow-argo-plugin/),
118 | [23andMe](https://medium.com/23andme-engineering/machine-learning-eeee69d40736), 
119 | [Realtor.com](https://medium.com/realtor-com-innovation-blog/improving-data-science-processes-to-speed-innovation-at-realtor-com-b6b90fa530dc), 
120 | [REA](https://www.rea-group.com/about-us/news-and-insights/blog/accelerating-experimentation-with-mlops/),
121 | [Coveo](https://outerbounds.com/blog/dataops-mlops-reasonable-organizations/),
122 | [Latana](https://aws.amazon.com/blogs/startups/brand-tracking-with-bayesian-statistics-and-aws-batch/),
123 | and
124 | hundreds of others across industries. Commercial support for Metaflow is provided by [Outerbounds](https://outerbounds.com).
125 | To hear first-hand experiences from these companies and many others, [join the Metaflow Slack](http://slack.outerbounds.co).
126 | 
127 | 


--------------------------------------------------------------------------------
/data/test-data/ob/blog/llm-tuning-metaflow.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Fine-tuning a Large Language Model using Metaflow, featuring LLaMA and LoRA"
  3 | date: "2023-06-21T12:41:26"
  4 | tags: ["AI", "Foundation Models", "LLM", "NLP"]
  5 | keywords: ["AI", "Foundation Models", "LLM", "NLP"]
  6 | categories: ["Foundation Models"]
  7 | authors: [federico, obteam]
  8 | description: "A workflow template built with Metaflow for fine-tuning LLMs for custom use cases."
  9 | image: "/blog/cover/llm-tuning-metaflow.png"
 10 | ogImage: "/blog/og/llm-tuning-metaflow.png"
 11 | slug: "llm-tuning-metaflow"
 12 | ---
 13 | 
 14 | ---
 15 | 
 16 | *This post demonstrates how you can use Metaflow to fine-tune a production-grade custom large language model (LLM) based on instruction tuning. We provide an open-source, fine-tuning workflow template that allows you to run models on local or cloud-based GPUs and connect the results to surrounding production systems using the complete toolchain of Metaflow.*
 17 | 
 18 | [Federico](https://federicobianchi.io) is a post-doctoral researcher at Stanford University, working on NLP and Large Language Models. He frequently releases research projects as open-source tools that have collectively gathered thousands of [GitHub stars](https://github.com/vinid).
 19 | 
 20 | ---
 21 | 
 22 | Motivation
 23 | ----------
 24 | Unsurprisingly, as of June 2023, building production systems that leverage custom large language models takes a lot of work and money. As usual in any novel technical field, one faces a diverse, quickly moving landscape of immature tooling, missing documentation, and vibrant but chaotic communities. Developers face complex and varying dependencies of LLMs, spanning hardware drivers and bleeding-edge software frameworks, which can take hours and days to figure out for any given use case.
 25 | 
 26 | Some astute companies may decide to wait a few years for the field to mature. However, many companies need or want to start innovating faster, and mature tools like [Metaflow](https://metaflow.org/) can help. Metaflow helps you design robust workflows to support any data-intensive application, allowing you to amortize the fixed cost of setting up systems - such as LLM experimentation pipelines - over the project's life. Using Metaflow, teams can set up appropriate human and technical workflows for LLM development and deployment, allowing them to focus on the business's specific needs.
 27 | 
 28 | Metaflow has been supporting serious ML use cases like this for years. The foundational needs of LLM development don’t differ much from other earlier models (besides scale!), so Metaflow is a natural fit for building systems around custom LLMs.
 29 | 
 30 | Foundational infrastructure for LLMs
 31 | ------------------------------------
 32 | If you just want to run the code and do not care about owning the supply chain or deploying the results to a production system, [Colab](https://colab.google/) and similar GPU-powered-notebook-as-a-service providers are a convenient way to access prototyping the needed infrastructure fast. Alternatively, projects like [GPT4All](https://gpt4all.io/index.html) make running LLM inference on a laptop without a GPU straightforward. 
 33 | 
 34 | However, as we discussed in [part one](/blog/custom-llm-tuning) of this series, there are cases where organizations that want to move beyond prototyping in notebooks and consider using LLMs in production need more substantial ML tooling and infrastructure: You need to consider how to connect models to surrounding systems, run training and inference and all supporting steps on a scalable compute platform, iterate and track models, code and data across versions, and have both technical and human workflows in place for continuous improvement and experiments. All this in business-critical systems with tight SLA, security, and compliance requirements.
 35 | 
 36 | How Metaflow helps
 37 | ------------------
 38 | This article shows how to use Metaflow to instruction-tune a custom LLM, accompanied by [the full stack of production-grade ML tooling](https://outerbounds.com/docs/infra-stack/) that Metaflow provides. We use a few lines of Metaflow code to access and monitor GPUs in CoreWeave’s cloud, making it easy to produce, track, and report results in shareable [Metaflow cards](https://docs.metaflow.org/api/cards) when running off-the-shelf open-source LLM code.
 39 | 
 40 | If you are unfamiliar with the features of Metaflow, we encourage you to look at the [Metaflow documentation](https://docs.metaflow.org). Crucially, all the functionality Metaflow provides can be helpful when developing and deploying real-world systems powered by LLMs.
 41 | 
 42 | For instance, Metaflow enables us to explore different model sizes and variants while quickly tuning infrastructure requirements and scheduling runs by changing single values in Python code. Switching between models helps us explore rapidly without unnecessarily wasting money on under-utilized GPUs. 
 43 | 
 44 | ![](../static/assets/huggingface-instruction-tuning.png)
 45 | <Caption> Top left: The lineage of the Alpaca-LoRA model used as a default in this post; Bottom: Steps in the Metaflow workflow; Top right: The output of the fine-tuning step is model checkpoints in HuggingFace  </Caption>
 46 | 
 47 | As Metaflow runs any Python code, you can easily leverage state-of-the-art models and APIs from your favorite modeling framework. For instance, In the example workflow, we push checkpoints to the HuggingFace hub with a few lines of code. In our example, we followed the [HuggingFace documentation](https://huggingface.co/docs/transformers/main_classes/trainer#checkpoints) to save all the checkpoints to a local folder, using a specific pattern that is checkpoint-xxx/ where xxx is the number of steps. 
 48 | 
 49 | Using HuggingFace APIs with Metaflow makes it easy to create a model repo for each checkpoint, train on whatever hardware you need, and push the results back to the hub to use in an application and for downstream testing. Doing this and sharing incremental model checkpoints on teams can also hugely reduce costs. People need not recompute the same model training results and can instead readily pick up training from where a colleague left off.
 50 | 
 51 | Instruction tuning with Metaflow
 52 | --------------------------------
 53 | For example, let’s fine-tune a recent LLaMA variant, taking an already trained LLaMA language model and training it further on an instruction tuning dataset. The learning task is still language modeling, but the training dataset contains specific instruction-following examples.
 54 | 
 55 | In this case, the model learns to do language modeling relevant to the instructions in the Alpaca dataset. You can imagine using any dataset, such as how the Databricks team created a custom instruction-tuning [dataset](https://huggingface.co/datasets/databricks/databricks-dolly-15k) in [Dolly v2](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm). You can find additional information about Metaflow and large language models in our [blog post on tuning Dolly](https://outerbounds.com/blog/train-dolly-metaflow/).
 56 | 
 57 | ### Show me the code
 58 | The core concepts of this workflow template are the following:
 59 | 1. HuggingFace to access the upstream models and to checkpoint our fine-tuned models,
 60 | 2. CoreWeave to access GPU resources, and
 61 | 3. Metaflow to structure the Python code, run it on the GPUs (carefully monitoring resource utilization with a Metaflow card), and track versions of code and models.
 62 | 
 63 | ![](../static/assets/coreweave-instruction-tuning.png)
 64 | 
 65 | The following code shows how to call the main fine-tuning logic from the [Alpaca LoRA codebase](https://github.com/tloen/alpaca-lora). 
 66 | 
 67 | <CustomAdmonition title="Instruction tuning with Metaflow" link="https://github.com/outerbounds/metaflow-instruction-tuning" color="#F9F7F5" image="/icons/common/github.png">
 68 |     Use PyTorch, HuggingFace and Metaflow to build a workflow for fine-tuning LLMs.
 69 | </CustomAdmonition>
 70 | 
 71 | ```python
 72 | # Contents of the flow.py file in the repository linked above.
 73 | 
 74 | from metaflow import FlowSpec, step, Parameter, resources, environment
 75 | from mixins import HuggingFaceLora, N_GPU, visible_devices
 76 | from custom_decorators import pip, gpu_profile
 77 | import os
 78 | 
 79 | class LlamaInstructionTuning(FlowSpec, HuggingFaceLora):
 80 | 
 81 |     push_checkpoints = Parameter("-push", help="push checkpoints on huggingface", default=False, type=bool)
 82 | 
 83 |     @step
 84 |     def start(self):
 85 |         self.next(self.finetune)
 86 | 
 87 |     @environment(vars={
 88 |         "CUDA_VISIBLE_DEVICES": visible_devices, 
 89 |         "WORLD_SIZE": N_GPU,
 90 |         "HUGGINGFACE_TOKEN": os.environ["HUGGINGFACE_TOKEN"],
 91 |         "HF_ORGANIZATION": os.environ["HF_ORGANIZATION"]
 92 |     })
 93 |     @gpu_profile(interval=1)
 94 |     @pip(file="requirements.txt")
 95 |     @resources(gpu=N_GPU, cpu=16, memory=128000) # tested with A100 and A6000 GPU.
 96 |     @step
 97 |     def finetune(self):
 98 |         self.run()
 99 |         if self.push_checkpoints:
100 |             self.upload_to_huggingface()
101 |         self.next(self.end)
102 | 
103 |     @step
104 |     def end(self):
105 |         pass
106 | 
107 | if __name__ == '__main__':
108 |     LlamaInstructionTuning()
109 | ```
110 | 
111 | [Metaflow](https://metaflow.org/) provides [a scaffolding for data science workflows](https://docs.metaflow.org/metaflow/introduction), all written in Python. In this case, the flow centers around the `finetune` step, where we use the [multiple inheritance pattern](https://realpython.com/lessons/multiple-inheritance-python/) to modularize the workflow, separating the Alpaca LoRA code that makes HuggingFace API calls from the Metaflow code that organizes the workflow.
112 | 
113 | Specifically, in the `HuggingFaceLora` class, we define the logic to run the open-source LLM fine-tuning code in a [distributed data-parallel](https://pytorch.org/docs/stable/notes/ddp.html?highlight=distributed+data+parallel) manner using the [torchrun](https://pytorch.org/docs/stable/elastic/run.html) API. The LLM code is run inside a Metaflow task, and a data scientist can add single lines of Python code to layer complex functionality on each task, [like requesting precise GPU and memory resources on the cloud instance that runs the job](https://docs.metaflow.org/scaling/remote-tasks/introduction) and monitoring the resources to ensure they are appropriately sized. 
114 | 
115 | The key idea for data science teams is that once they have access to a Metaflow deployment, a data scientist who can feasibly write Python code can [write, run, and deploy production-grade workflows in a single work session](https://docs.metaflow.org/introduction/what-is-metaflow). Metaflow [persists and tracks all artifacts automatically](https://docs.metaflow.org/metaflow/basics#artifacts), [exposes them to notebooks](https://docs.metaflow.org/metaflow/client) and other programmatic use cases, and shows them on a UI. Once everything works locally, the workflow can be [deployed to production, connecting to other workflows both upstream and downstream](https://docs.metaflow.org/production/introduction), with a single click.
116 | 
117 | If you are curious to test all these features in action, you can [do it easily in the Metaflow sandbox](https://outerbounds.com/sandbox). 
118 | 
119 | Models
120 | ------
121 | We fine-tuned four of the recent LLaMA models on the same dataset with a fixed computing budget for each model; we used [Low-Rank Adaptation](https://arxiv.org/abs/2106.09685), making use of the recent [Alpaca LoRA repository](https://github.com/tloen/alpaca-lora). The models we fine-tuned are the 7B, 13B, 33B, and 65B parameters models, with the idea that larger models should provide better performance and answers. We saved multiple checkpoints for each model and chose the best one by validation loss after training. 
122 | 
123 | We chose the Alpaca dataset - built by generating instructions from GPT-3. We use a [ version](https://huggingface.co/datasets/yahma/alpaca-cleaned) of the dataset in which some errors are corrected. The dataset contains 50K instructions. You can find more details about how we used Metaflow in the practical considerations section.
124 | 
125 | ### Quantitative evaluation
126 | We wanted to assess if our fine-tuned models are still performing as expected after fine-tuning. While there is still active discussion about how to evaluate instruction and chat models (see the [ChatBot Arena](https://lmsys.org/blog/2023-05-03-arena/), for example), we decided to evaluate our models against standard LLM benchmarks as a more customized evaluation approach is beyond the scope of this post. Specifically, we make use of the [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness) package developed by EleutherAI to evaluate our fine-tuned models on these different tasks in a zero-shot manner. We chose three question-answering benchmarks: [PIQA](https://huggingface.co/datasets/piqa) and [BoolQ](https://huggingface.co/datasets/boolq), and [OpenBookQA](https://huggingface.co/datasets/openbookqa). 
127 | 
128 | Here is an example from PIQA (the other datasets are similar):
129 | 
130 | > “To separate egg whites from the yolk using a water bottle, you should…”
131 | 
132 | The dataset has two possible answers: 
133 | 
134 | > “Squeeze the water bottle and press it against the yolk. Release, which creates suction and lifts the yolk.”
135 | 
136 | > “Place the water bottle and press it against the yolk. Keep pushing, which creates suction and lifts the yolk.” 
137 | 
138 | Given that this is a multiple choice question task, [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md#multiple-choice-format) runs the evaluation by comparing the likelihood of the label words.
139 | 
140 | So, how well do these models work? Let’s look at some results (the evaluation metric here is accuracy).
141 | 
142 | <div style={{alignItems: 'center', justifyContent: 'center', display: 'flex'}}>
143 | <div style={{textAlign: 'center'}}>
144 | 
145 | | Model | PIQA | BOOLQ | OPENBOOKQA |
146 | | ----- | ---- | ----- | ---------- |
147 | | LLAMA 7B | 0.7916 | 0.7511 | 0.4440 |
148 | | ALPACA 7B | 0.8030  | 0.7835  | 0.4420 |
149 | | LLAMA 13B | 0.8003 | 0.7798 | 0.4480 |
150 | | ALPACA 13B | 0.8030 | 0.8150 | 0.4620 |
151 | 
152 | </div>
153 | </div>
154 | 
155 | These results seem coherent with what was expected based on various leaderboards (see [LLaMA results](https://paperswithcode.com/paper/llama-open-and-efficient-foundation-language-1), for example) and papers. There is not a notably high increase in performance, as we expected. 13B parameters models do slightly better than their 7B counterparts.  We do not expect a substantial increase in performance with respect to the original LLaMA models; instead, we would like to maintain similar performances on these benchmarks. Instruction tuning should give us models that follow instructions better and do not perform worse on general benchmarks. Our evaluation shows we have nicely performing models on the latter, but do they follow instructions?
156 | 
157 | ### Qualitative evaluation
158 | When working with LLMs, being mindful that the research field is new and changing rapidly is essential. Therefore, when considering using this technology in a production environment, we need to evaluate qualitatively and measure performance against quantitative benchmarks, which may need significant iteration and revision for assessing performance in a way that makes sense in any given production context.
159 | 
160 | Even with the small training dataset we used and only three epochs of training, our models already see exciting results. The bigger the model is, the better (and more valuable) its generations become.
161 | 
162 | <div style={{alignItems: 'center', justifyContent: 'center', display: 'flex'}}>
163 | 
164 | | Instruction | 7B | 30B |
165 | | ----------- | -- | --- |
166 | | **Write a poem about capybaras** | Capybaras, the largest rodent in the world, <br></br><br></br> With their shaggy coats and <span style={{color: 'red'}}> bushy tails</span>, <br></br><br></br> They roam the grasslands of South America, <br></br><br></br> In search of food and water to quench their thirst. <br></br><br></br> Capybaras are herbivores, and they love to graze, <br></br><br></br> Eating grasses, fruits, and vegetables, too, <br></br><br></br> They are social creatures, living in herds, <br></br><br></br> And they're known for their playful and friendly deeds. | A Capybara's Life <br></br><br></br> A capybara's life is filled with joy, <br></br> As they frolic in the grass and play in the sun. <br></br><br></br> They're the largest rodents in the world, <br></br><br></br> <span style={{color: 'green'}}>With bodies built for swimming and for fun.</span> <br></br><br></br> They're social creatures, always in a group,<br></br><br></br> Enjoying each other's company, without a flop. <br></br><br></br> They love to eat grass and aquatic plants,<br></br><br></br> A diet that keeps them healthy and well-fed. |
167 | | **Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering.** (From Flan) | No, it is not possible for Geoffrey Hinton to have a conversation with George Washington. This is because George Washington is a historical figure who lived from 1732 to 1799, while Geoffrey Hinton is a living person who was born in <span style={{color: 'red'}}>1957</span>. Time travel is not possible, so it is not possible for two people from different eras to have a conversation with each other. | No, it is not possible for Geoffrey Hinton to have a conversation with George Washington. Geoffrey Hinton is a living person who was born in 1947, while George Washington died in 1799. It is not possible for a living person to have a conversation with someone who has been dead for more than 200 years. |
168 | | **Classify this document in one of these categories, "sport", "politics", or "technology".** <br></br><br></br> **“The LLaMA models are a family of Natural Language Processing LLMs that can be applied to many different tasks.”** | technology | technology |
169 | | **Do aliens invest in stocks?** | No, aliens do not invest in stocks. Stock investing is a human activity that involves buying and selling shares of publicly traded companies in order to profit from changes in their share prices. Aliens do not have access to the same financial markets and systems that humans do, so they would not be able to participate in stock investing. <br></br><br></br> <span style={{color: 'orange'}}>However, it is possible that aliens could invest in other types of financial instruments</span>, such as bonds, mutual funds, or real estate, if they had access to the appropriate financial markets and systems. | I'm sorry, but as an AI language model, I don't have access to any information about aliens or their investment habits. It's possible that some alien civilizations may invest in stocks, but it's also possible that they may have different investment strategies or may not invest in stocks at all. It's difficult for me to give a definitive answer to this question without more information or data. It's possible that some alien civilizations may invest in stocks, but it's also possible that they may have different investment strategies. |
170 | | **To separate egg whites from the yolk using a water bottle, you should…** (from PIQA) | To separate egg whites from the yolk using a water bottle, you should follow these steps: <br></br><br></br> 1. Crack the egg into a small bowl and separate the yolk from the egg white. <br></br> <span style={{color: 'red'}}>2. Fill the water bottle with cold water. </span><br></br> 3. Place the egg white into the water bottle. <br></br> <span style={{color: 'red'}}>4. Shake the water bottle vigorously to separate the egg white from the yolk. </span><br></br> 5. Remove the egg white from the water bottle and discard the yolk. <br></br> 6. Repeat the process for the remaining eggs. | To separate egg whites from the yolk using a water bottle, you should follow these steps: <br></br><br></br> 1. Start by cracking an egg into a small bowl or cup. <br></br> 2. Hold the water bottle upside down over the bowl or cup. <br></br> 3. Slowly squeeze the water bottle, allowing the <span style={{color: 'red'}}>water to flow</span> into the bowl or cup. <br></br> 4. As the water flows into the bowl or cup, it will push the egg yolk towards the bottom of the bowl or cup. |
171 | | **Best way to pierce ears** (from PIQA)| The best way to pierce ears is to use a sterilized, hygienic piercing gun or needle. It is important to follow the instructions provided by the piercing professional and to clean the area thoroughly before and after the piercing. It is also important to use a high-quality, non-allergenic jewelry and to take proper care of the piercing by cleaning it regularly and avoiding contact with water while the piercing is healing. It is recommended to wait at least 6-8 weeks for the piercing to heal completely before changing the jewelry. | <span style={{color: 'green'}}>The safest and most effective way to pierce your ears is to have it done by a professional piercer at a reputable piercing studio.</span> A professional piercer will use sterilized equipment and follow strict hygienic procedures to minimize the risk of infection or other complications. They will also be able to advise you on the best placement and type of jewelry for your piercing, as well as provide aftercare instructions to ensure proper healing. |
172 | | **What's the best way to drink water if you are juggling three balls?** | If you are juggling three balls, the best way to drink water is to use a straw. This will allow you to keep your hands free to continue juggling, while also ensuring that you don't spill any water on yourself or your juggling equipment. Additionally, it's important to drink small sips of water, rather than gulping down large amounts at once. This will help you to stay hydrated without interfering with your juggling routine. | <span style={{color: 'green'}}>If you are juggling three balls, the best way to drink water would be to take a break from juggling, set the balls down on a safe surface, and take a sip of water from a cup or bottle.</span> Juggling and drinking at the same time can be difficult and may result in spilling water or dropping the balls. It's important to stay hydrated while juggling, but it's best to do so in a way that doesn't interfere with your juggling performance. |
173 | 
174 | </div>
175 | 
176 | These results show multiple things: first off, the models are good at replying to questions and instructions. They all come up with coherent text that appears correct. The 7B model is more prone to generating incorrect stuff, such as getting the wrong year of birth of someone or suggesting something that might be unsafe. We have color coded some replies to highlight possible good (green), wrong (red), or unclear (orange) answers.
177 | 
178 | Summary
179 | -------
180 | You can use the above Metaflow workflow as a generic template for fine-tuning models available on HuggingFace. Besides the Alpaca instruction dataset we used to fine-tune various sizes of LLMs, you can apply the same approach to fine-tune models with your instructions to optimize models for your use cases. We are happy to help you get started with this template and Metaflow in general. Join us and thousands of other ML engineers, AI enthusiasts, and data scientists on  [Metaflow Community Slack](http://slack.outerbounds.co/)! 
181 | 
182 | Acknowledgments
183 | ---------------
184 | We would like to thank Mert Yuksekgonul, Mirac Suzgun, Patrick John Chia, Silvia Terragni, Giuseppe Attanasio, and Jacopo Tagliabue for feedback and suggestions on a previous version of this blog post.
185 | 
186 | 


--------------------------------------------------------------------------------
/data/test-data/ob/blog/metaflow-fast-data.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Fast Data: Loading Tables From S3 At Lightning Speed"
  3 | date: "2023-05-05T20:13:16"
  4 | tags: ["data", "MLOps"]
  5 | keywords: ["data", "MLOps"]
  6 | categories: ["Data", "Best Practices"]
  7 | authors: [obteam]
  8 | description: "Load and process tabular data at lightning speed using Apache Arrow and Metaflow."
  9 | slug: "metaflow-fast-data"
 10 | image: "/blog/cover/metaflow-fast-data.png"
 11 | ogImage: "/blog/og/metaflow-fast-data.png"
 12 | ---
 13 | 
 14 | ---
 15 | 
 16 | *This article answers a common question: How can I load tabular data quickly from S3 into a Metaflow workflow and process it efficiently? Thanks to Metaflow’s straightforward support for vertical scalability, its high-throughput S3 client, and high-performance tooling powered by Apache Arrow, you can handle surprisingly large amounts of data, surprisingly fast.*
 17 | 
 18 | *You can execute all benchmarks and examples with [the code in this repository](https://github.com/outerbounds/fast-data-blog/).*
 19 | 
 20 | ---
 21 | 
 22 | Ever since the early days of Metaflow at Netflix, we have been asking a fundamental question from data scientists and other users of Metaflow: What is the greatest source of friction in your day to day work? Maybe unsurprisingly, the majority of answers relate to data: Finding the right data, accessing data, transforming data, and utilizing data in various ways.
 23 | 
 24 | In contrast to traditional software, which has a limited surface area with the outside world, data exposes ML/AI-powered applications to constantly changing real-world entropy and complexity. It would be unreasonable to expect that we can address the diverse use cases with a single nifty solution. Needs differ greatly, say, between computer vision, NLP, large tabular data, and small datasets.
 25 | 
 26 | We can support a wide variety of use cases by working well with a broad set of open-source data tools and efficient infrastructure. This approach is illustrated by our view of [the full stack of ML infrastructure](https://outerbounds.com/docs/infra-stack/) which acknowledges the need of providing plenty of support for data:
 27 | 
 28 | ![](_metaflow_fast_data/fast-data-stack.png)
 29 | 
 30 | Metaflow doesn’t provide magical new abstractions or paradigms for processing data, but it provides foundational, human-friendly infrastructure that works smoothly with the tools of your choice. Data scientists certainly care about
 31 | data but data tooling should just work without hassle. Often simpler is better.
 32 | 
 33 | This article focuses on a common use case of data: tabular, structured datasets. This use case is particularly interesting, as tooling for it has evolved rapidly over the past few years.
 34 | 
 35 | 
 36 | A fast path for tabular data
 37 | ----------------------------
 38 | 
 39 | All modern data warehousing solutions either store data natively as Parquet-encoded files (like various versions of [the cloud-based data lake](https://towardsdatascience.com/an-aws-data-lake-with-s3-explained-c67c5f161db3) e.g. powered by [AWS Athena](https://aws.amazon.com/athena/) or [Spark](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark.html)), or they allow exporting/unloading data as Parquet files (e.g. [Snowflake](https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet) or [Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html)).
 40 | 
 41 | Conveniently, these systems allow you to create materialized views which can be processed outside the warehouse. You can use SQL and [the Create-Table-As-Select (CTAS) pattern](https://docs.aws.amazon.com/athena/latest/ug/ctas.html) to query even petabytes of data, extracting the results for downstream processing in Metaflow.
 42 | 
 43 | Once you have Parquet files in S3, you can hit the fast path:
 44 | 
 45 | ![](_metaflow_fast_data/arrow-diagram.png)
 46 | 
 47 | 1. You can load data from S3 directly to memory *very quickly*, at tens of gigabits per second or more, using [Metaflow’s optimized S3 client, `metaflow.S3`](https://docs.metaflow.org/scaling/data).
 48 | 
 49 | 2. Once in memory, Parquet data can be decoded efficiently using [Apache Arrow](https://arrow.apache.org/).
 50 | 
 51 | 3. The in-memory tables produced by Arrow are interoperable with various modern data tools, so you can use the data in various ways without making additional copies, which speeds up processing and avoids unnecessary memory overhead.
 52 | 
 53 | This pattern is becoming increasingly popular, thanks to the maturation of Apache Arrow itself and versatile tools around it. The excitement is further motivated by the realization that a single medium-size EC2 instance can handle an amount of data that used to require a Hadoop or Spark cluster just a decade ago - this time with much fewer operational headaches, and at a much lower cost.
 54 | 
 55 | The pattern is still new, so not everyone is fully aware of how well and fast it works - under suitable conditions. To showcase this, we first focus on the left side of the diagram, addressing a common myth that loading data from S3 is slow. After this, we focus on the right side, demonstrating how to use the data with various high-performance tools efficiently.
 56 | 
 57 | S3 is fast (when used correctly)
 58 | --------------------------------
 59 | 
 60 | Occasionally, we are asked how to cache data from S3 on local disk, instance volume, or a distributed file system like EFS to make loading and processing data faster. Counterintuitively, loading data from S3 can be much faster than local disk, so there's no need to add extra layers of complexity.
 61 | 
 62 | The question likely stems from first-hand experiences of S3 being slow, which can easily happen if you don’t have a setup that leverages its strengths. This benchmark illustrates the point:
 63 | 
 64 | ![](_metaflow_fast_data/load_s3_ec2_chart.png)
 65 | 
 66 | The bars refer to a Metaflow task that uses `metaflow.S3` to download the dataset, running on EC2 with AWS Batch [using the @batch decorator](https://docs.metaflow.org/scaling/remote-tasks/aws-batch), comparing four different configurations:
 67 | 
 68 | - **Small (disk)**: 8GB RAM, 2 CPU cores, a c4.2xlarge instance with EBS.
 69 | - **Medium (disk)**: 8GB RAM, 8 CPU cores, a c4.8xlarge instance with EBS.
 70 | - **Medium (tmpfs)**: 30GB RAM, 8 CPU cores, a c4.8xlarge utilizing Metaflow’s new `tmpfs` feature.
 71 | - **Large (tmpfs)**: 200GB RAM, 32 CPU cores, an m5n.24xlarge utilizing Metaflow’s new `tmpfs` feature.
 72 | 
 73 | There is a 35x fold speed difference between the slowest and the fastest configuration! Understandably, if your tasks ran under the first configuration, you would conclude that S3 is slow.
 74 | 
 75 | ### Maximizing the S3 performance
 76 | 
 77 | A key observation is that the download throughput of S3 is a combination of multiple factors. In order to achieve the maximum throughput, pay attention to the following dimensions:
 78 | 
 79 | - **Same region**: Make sure the EC2 instances hosting the tasks are located in the same region as the S3 bucket you are loading data from.
 80 | - **File layout**: You need to download multiple files in parallel using e.g. [metaflow.S3.get_many](https://docs.metaflow.org/api/S3#S3.get_many). The files should be around 0.1-1GB each. Fortunately, it is easy to produce partitioned outputs like this with many query engines.
 81 | - **Instance size**: Larger EC2 instances boost higher number of CPU cores, network throughput, and memory. When using e.g. `@batch` with Metaflow, instances autoscale based on demand, so faster processing times can lead to lower costs, despite the higher unit costs of larger instances.
 82 | - **Data fits in RAM**: Crucially, loading data from S3 directly to memory is faster than loading data from S3 to an instance volume. If data doesn’t fit in memory, performance can be very bad due to slow local disk IO.
 83 | 
 84 | The importance of the last point is demonstrated by the two medium bars: By increasing the amount of memory available for the tasks - simply by setting `@resources(memory=32000)` - we gain an eightfold increase in S3 throughput!
 85 | 
 86 | The highest bar demonstrated the massive horsepower available on the largest instances: By using `metaflow.S3`, we can reach a beast-mode throughput of 32 GBit/s, which can easily beat local SSDs on laptops. In other words, loading data from S3 to an EC2 instance can be faster than loading data locally on a laptop, or loading it from an EBS volume, an ephemeral instance disk, or EFS.
 87 | 
 88 | New feature: tmpfs support for Batch and Kubernetes
 89 | ---------------------------------------------------
 90 | 
 91 | We have been using the above fast data pattern successfully for years to power Metaflow workflows handling terabytes of data. However, this far, there has been a small gotcha.
 92 | 
 93 | While allocating enough memory through `@resources` should guarantee that data can be downloaded quickly, we still rely on the local filesystem to back the files nominally, although the files never hit the disk thanks to caching. Sadly, it is not possible to allocate disk space for tasks on the fly while using AWS Batch. Doing it requires a bit more tedious change in [the instance’s launch template](https://docs.aws.amazon.com/batch/latest/userguide/launch-templates.html).
 94 | 
 95 | To address this issue, recently we implemented support for memory-based `tmpfs` filesystem on Batch and Kubernetes in Metaflow. You can use the feature to create an ephemeral filesystem backed by memory on the fly, without having to change anything on the infrastructure side.
 96 | 
 97 | We made sure that the `metaflow.S3` client is aware of the `tmpfs` volume, so it will automatically use it to speed up downloads when you enable it. To benefit from the new feature, simply add
 98 | 
 99 | ```python
100 | @batch(use_tmpfs=True)
101 | ```
102 | 
103 | for your AWS Batch workloads or
104 | 
105 | ```python
106 | @kubernetes(use_tmpfs=True)
107 | ```
108 | for Kubernetes. There are a few additional features related to `tmpfs` which you can [read more about in the Metaflow documentation](https://docs.metaflow.org/scaling/data#using-metaflows3-for-in-memory-processing).
109 | 
110 | 
111 | From cloud to table
112 | -------------------
113 | 
114 | As shown above, we can load bytes to memory very quickly. To actually use the data, we must also decode the Parquet files to [an in-memory `pyarrow.Table` object](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html). When taking the decoding time into account, the differences are even more striking:
115 | 
116 | ![](_metaflow_fast_data/decode_parquet_chart.png)
117 | 
118 | The configurations are the same as in the first chart above. We added a top-of-the-line M2 Macbook Pro (the purple bar) as a comparison point. The Macbook loads data from a local SSD, not from S3 as the instances do.
119 | 
120 | As shown above, a powerful Macbook beats small and medium-size instances when the instances are using a local disk, but even a medium-size instance beats the Macbook when using in-memory processing. In this case, a large instance can load a table about 8x faster than a Macbook.
121 | 
122 | This result is a combination of multiple factors: The larger instances have more CPU cores, more memory, and more IO bandwidth available than a Macbook. Naturally, you must utilize the resources efficiently to gain the maximum performance: Use [metaflow.S3.get_many](https://docs.metaflow.org/api/S3#S3.get_many) to maximize network throughput and multithreaded decoding to benefit from all the available CPU cores, as exemplified by [the `table_loader.py` utility module](https://github.com/outerbounds/fast-data-blog/blob/main/table_loader.py).
123 | 
124 | 
125 | From table to results
126 | ---------------------
127 | 
128 | With [a `pyarrow.Table` object](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html) in memory, the possibilities are many! We highlight three common use cases below.
129 | 
130 | Crucially, all the libraries listed below support zero-copy instantiation from a `pyarrow.Table`, so you don’t have to reserve a large amount of extra `@resources(memory=)` just to handle extra copies or inefficient internal representations. Ballooning memory overhead used to be a problem in the past with older Pandas, prior to Arrow.
131 | 
132 | 
133 | You can code along with the following sections using [this repository](https://github.com/outerbounds/fast-data-blog/).
134 | 
135 | ### Dataframes
136 | 
137 | Moving to Pandas is trivial and fast:
138 | 
139 | ```python
140 | df = table.to_pandas()
141 | ```
142 | 
143 | You can add a `zero_copy_only=True` flag to make sure that you will get warned if the conversion requires extra copies of data. Since [the Pandas 2.0 release](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i) that came out just a month ago, Pandas is fully compatible with Arrow data types, so working with Arrow and Pandas should be smooth sailing.
144 | 
145 | While Pandas provides a familiar API, its operations are not always the fastest. You can often enjoy a higher performance by using [Polars - a high-performance dataframe library](https://pola.rs):
146 | 
147 | ```python
148 | import polars as pl
149 | df = pl.from_arrow(table)
150 | ```
151 | 
152 | ### SQL
153 | 
154 | Sometimes it is more convenient to use SQL for data processing than dataframe APIs. For instance, you can follow this
155 | pattern to enable clear division of responsibilities between data engineers and data scientists:
156 | 
157 | ![](_metaflow_fast_data/data-flow.png)
158 | <Caption>
159 | Diagram adapted from the book, <a href="https://www.amazon.com/Effective-Data-Science-Infrastructure-scientists/dp/1617299197/">Effective Data Science Infrastructure</a> 
160 | </Caption>
161 | 
162 | In this scenario, data engineers are responsible for maintaining company-wide, authoritative fact tables that individual projects can rely on. Projects may create their own interpretations of data through their own lightweight ETL process e.g. for feature transformations.
163 | 
164 | The project-specific ETL can run within their Metaflow workflows using [a small but surprisingly powerful in-process database, DuckDB](https://duckdb.org/). A benefit of this approach is that data scientists can own and iterate on their specific ETL very quickly, even testing it locally on their workstations.
165 | 
166 | Using this pattern is not harder than using a dataframe:
167 | 
168 | ```python
169 | import duckdb
170 | query = "SELECT * FROM arrow_table"
171 | relation = duckdb.arrow(arrow_table)
172 | df = relation.query('arrow_table', query).to_df()
173 | ```
174 | 
175 | To familiarize with using DuckDB and Metaflow together in a machine learning context, you can follow along with this [recommender systems tutorial](https://outerbounds.com/docs/recsys-tutorial-L2/).
176 | 
177 | ### Interfacing with ML libraries
178 | 
179 | Many modern ML libraries are well optimized when it comes to utilizing CPU and GPU resources. However, to benefit
180 | from high-performance training and inferencing, you need to be able to feed data to the model fast.
181 | 
182 | Luckily, Arrow data is [readily convertible to Pandas and NumPy](https://arrow.apache.org/docs/python/numpy.html) which are supported by all major ML libraries. [Through NumPy, you can also convert data to various tensor objects](https://outerbounds.com/docs/python-tabular-data-structures/).
183 | 
184 | To illustrate the pattern, [we included a simple ML example](https://github.com/outerbounds/fast-data-blog/blob/main/fast_data_modeling.py) that shows how to convert an Arrow table to a Pandas dataframe, and feed this object to [a LightGBM model](https://lightgbm.readthedocs.io/). The flow trains a model to predict a target variable based on 300 finance instruments.
185 | 
186 | You can run the code easily by yourself and [visualize the results through a Metaflow card](https://docs.metaflow.org/metaflow/visualizing-results), [like this one](https://cards.outerbounds.dev/index.html?id=pw-1347686565%2F0e5a2797fce749c5):
187 | 
188 | <iframe src="https://cards.outerbounds.dev/index.html?id=pw-1347686565%2F0e5a2797fce749c5" width={1000} height={800}></iframe>
189 | 
190 | Summary
191 | -------
192 | 
193 | This post outlined patterns that you can use to load tabular data from S3 to your Metaflow workflows quickly and process it in various ways using Arrow-compatible libraries.
194 | 
195 | In future posts, we will dive deeper into related topics around data: How to use data engineering tools like `dbt` together with Metaflow, how to handle large datasets in a horizontally scalable manner using the patterns highlighted here, and how to make sure that the pattern comply with your data governance policies.
196 | 
197 | You can test patterns highlighted here in [the Metaflow sandbox](https://outerbounds.com/sandbox). If you have any
198 | feedback, questions, or other thoughts around data, [join us and thousands of other data scientists and engineers on Metaflow Slack](http://slack.outerbounds.co)!
199 | 
200 | ---
201 | 
202 | PS. If open-source data topics pique your interest, join [our upcoming event on The Open-Source Modern Data Stack on June 7th.](https://www.eventbrite.com/e/the-open-source-modern-data-stack-tickets-630584333717)
203 | 
204 | <a href="https://www.eventbrite.com/e/the-open-source-modern-data-stack-tickets-630584333717">
205 | 
206 | ![](../static/assets/tabular-fs-chat-event.png)
207 | 
208 | </a>


--------------------------------------------------------------------------------
/data/test-data/ob/docs/data-science/compute/use_multiple_cpu_cores.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Use Multiple CPU Cores
  3 | slug: /docs/use-multiple-cpu-cores
  4 | tags: [versioning]
  5 | sidebar_label: 🍃  Use Multiple Cores
  6 | id: use-multiple-cpu-cores
  7 | pagination_next: null
  8 | pagination_prev: null
  9 | description: How can I use multiple CPU cores inside of Metaflow tasks?
 10 | category: data science
 11 | hide_table_of_contents: false
 12 | ---
 13 | 
 14 | 
 15 | <!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! Instead, edit the notebook w/the location & name as this file. -->
 16 | 
 17 | ## Question
 18 | How can I use multiple CPU cores within Metaflow tasks?
 19 | 
 20 | ## Answer
 21 | You can use multiple cores like you normally would in your Python code in a Metaflow task. 
 22 | For example, you can use Python's [concurrent.future](https://docs.python.org/3/library/concurrent.futures.html) module to launch multiple processes and threads.
 23 | 
 24 | 
 25 | 
 26 | ### <NumberHeading number={1}>Write Flow with Concurrency in a Task</NumberHeading>
 27 | 
 28 | 
 29 | 
 30 | 
 31 | This flow shows how to:
 32 | - Call the `do_task` function four times on different cores using Python's built-in `concurrent.futures.ThreadPoolExecutor`. 
 33 | - Print the total time elapsed to demonstrate the tasks ran in parallel.
 34 | 
 35 | 
 36 | 
 37 | 
 38 | 
 39 | 
 40 | 
 41 | ```py title="use_multiple_cpu_cores.py"
 42 | from metaflow import FlowSpec, step, Parameter
 43 | from concurrent import futures
 44 | import time
 45 | 
 46 | def do_task(name):
 47 |     time.sleep(3) # a long-running task
 48 |     return name
 49 | 
 50 | class MulticoreFlow(FlowSpec):
 51 | 
 52 |     #highlight-next-line
 53 |     num_cores = Parameter('num-cores', default=4)
 54 | 
 55 |     @step
 56 |     def start(self):
 57 |         threads = [
 58 |             "thread_%s" % i 
 59 |             for i in range(self.num_cores)
 60 |         ]
 61 |         t0 = time.time()
 62 |         #highlight-start
 63 |         with futures.ThreadPoolExecutor(
 64 |             max_workers = self.num_cores
 65 |         ) as exe:
 66 |             self.messages = [
 67 |                 msg for msg in exe.map(do_task, threads)
 68 |             ]
 69 |         #highlight-end
 70 |         self.time_elapsed = time.time() - t0
 71 |         self.next(self.end)
 72 | 
 73 |     @step
 74 |     def end(self):
 75 |         print_msg = "All tasks completed in %.3fs"
 76 |         print(print_msg % self.time_elapsed)
 77 |         [print(msg) for msg in self.messages]
 78 | 
 79 | if __name__ == '__main__':
 80 |     MulticoreFlow()
 81 | ```
 82 | 
 83 | 
 84 | 
 85 | ### <NumberHeading number={2}>Run Flow</NumberHeading>
 86 | 
 87 | 
 88 | 
 89 | ```bash
 90 | python use_multiple_cpu_cores.py run
 91 | ```
 92 | 
 93 | <CodeOutputBlock lang="bash">
 94 | 
 95 | ```
 96 |      Workflow starting (run-id 1663278003721691):
 97 |      [1663278003721691/start/1 (pid 47937)] Task is starting.
 98 |      [1663278003721691/start/1 (pid 47937)] Task finished successfully.
 99 |      [1663278003721691/end/2 (pid 47941)] Task is starting.
100 |      [1663278003721691/end/2 (pid 47941)] All tasks completed in 3.011s
101 |      [1663278003721691/end/2 (pid 47941)] thread_0
102 |      [1663278003721691/end/2 (pid 47941)] thread_1
103 |      [1663278003721691/end/2 (pid 47941)] thread_2
104 |      [1663278003721691/end/2 (pid 47941)] thread_3
105 |      [1663278003721691/end/2 (pid 47941)] Task finished successfully.
106 |      Done!
107 | ```
108 | 
109 | </CodeOutputBlock>
110 | 
111 | 
112 | 
113 | 
114 | ## Further Reading
115 | * [Parallelization Over Multiple Cores](https://docs.metaflow.org/metaflow/scaling-out-and-up/effortless-scaling-with-kubernetes#parallelization-over-multiple-cores)
116 | * [How to Scale Model Training and Tuning to GPU](/docs/scale-model-training-and-tuning/)
117 | 


--------------------------------------------------------------------------------
/data/test-data/ob/docs/data-science/orchestration/dags_for_data_science.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Why Should I Care About DAGs and Workflows in Data Science?
 3 | slug: /docs/dags-in-data-science
 4 | tags: [dag, orchestration]
 5 | sidebar_label: 🌳  Why DAGs?
 6 | id: dags-in-data-science 
 7 | pagination_next: null
 8 | pagination_prev: null
 9 | description: The complexity of machine learning and data science workflows can become complicated quickly. To avoid a jungle of data pipelines and models, we need organizational principles for  workflows, and directed acyclic graphs have become a standard for this
10 | category: data science
11 | hide_table_of_contents: false
12 | ---
13 | 
14 | 
15 | <!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! Instead, edit the notebook w/the location & name as this file. -->
16 | 
17 | ## DAGs and Data Science Workflows
18 | 
19 | The complexity of machine learning and data science workflows can become complicated very quickly. To avoid a jungle of data pipelines and models, we need organizational principles for our workflows, and directed acyclic graphs aka DAGs have become a standard for this:
20 | 
21 | ![](/assets/dags-in-data-science-01.png)
22 | <Caption> Common data science DAG </Caption>
23 | 
24 | DAGs don’t only exist to tame complexity in the abstract, but are practically useful for several reasons:
25 | 
26 | They provide a common vocabulary of steps (the nodes) and transitions between them (the edges), which make it easier to write and understand nontrivial applications that are structured as DAGs.
27 | They both encourage and force us to be explicit about the order in which our workflows are executed. This is particularly useful when the order is anything more complicated than a linear flow, such as you would see in a notebook. Being explicit about the order of operations helps us to manage what could be a jungle of models and data pipelines.
28 | Using DAGs, we can signal when the order of operations doesn’t matter, such as when training independent models. We can parallelize these operations automatically, which is essential for performant code.
29 | 
30 | In summary, it is helpful to view DAGs as a language, not a programming language per se, but rather a formal construct for human-to-human communication. With DAGs, we can speak about complex sequences of operations in a concise and understandable manner.
31 | 
32 | ## What exactly is DAG?
33 | 
34 | It is worth defining explicitly what a DAG is: Directed Acyclic Graph may sound like a mouthful but you can think of DAGs as follows:
35 | 
36 | * A graph is a set of nodes connected by edges
37 | * A graph is directed if each edge is an arrow (that is, pointing from one node to another)
38 | * Acyclic means there are no cycles
39 | 
40 | The nodes of the DAG of a data science or machine learning workflow are the steps in your workflow, such as (but not limited to) getting your data, training a model, or choosing the best model, as in the DAG above. In the case of Metaflow, a step can contain any Python code - like a cell in a notebook. The edges tell you which step(s) follow from other steps and this is why they need to be directed. The reason we don’t want cycles may now be clear: we don’t want our pipelines to end up in infinite loops!
41 | 
42 | In an industrial setting, DAGs can be and often are far more complicated than the toy case above:
43 | 
44 | ![](/assets/dags-in-data-science-02.png)
45 | <Caption> DAGs expand quickly in data science workflows </Caption>
46 | 
47 | To be explicit here, DAGs are important in order to tame the complexity of machine learning workflows. There is an important corollary: not all data science workflows are DAGs, rather, DAGs are a particular type of workflow.
48 | 
49 | 
50 | ## Orchestrating Workflows
51 | 
52 | Once we have a DAG, in order to perform our computation, we require a workflow orchestrator. The workflow orchestrator needs to perform a seemingly simple task: given a workflow or DAG definition, execute the steps defined by the graph in order. The orchestrator walks through the graph, sending each step to a separate compute layer that is responsible for executing the code contained in a step. For data-intense, distributed workflows a compute layer like Spark or Kubernetes can be used, whereas local processes suffice for prototyping.
53 | In contrast to prototyping, production workflows need to run automatically and reliably without human intervention. Workflow orchestrators play key roles in ensuring that production workflows, such as those training models and producing predictions regularly, execute in a scalable and highly available manner. Hence it makes sense to use a battle-hardened, proven system in production such as:
54 | * [Argo](https://argoproj.github.io/), a modern orchestrator that runs natively on Kubernetes
55 | * [Airflow](https://airflow.apache.org/), a popular open-source workflow orchestrator 
56 | * or a low-maintenance managed solution such as [Google Cloud Composer](https://cloud.google.com/composer) or [AWS Step Functions](https://aws.amazon.com/step-functions/?step-functions.sort-by=item.additionalFields.postDateTime&step-functions.sort-order=desc)
57 | 
58 | ## Developing Workflows
59 | Earlier, data science applications were often developed as independent modules which were glued together as a DAG afterwards, sometimes even by a separate team. As a result, it was hard to ensure that the whole application worked correctly end to end. Debugging workflow failures was often a painful experience as issues couldn’t be easily reproduced and fixed locally.
60 | A better approach is to consider the whole workflow as a key deliverable of a data science project, which is developed, tested, and deployed as a functional unit. When using a local orchestrator like the one provided by Metaflow, you can rapidly prototype end-to-end workflows almost as easily as how you develop code in a notebook. After the workflow works locally, it can be deployed to a production orchestrator without any changes in the code. After all, a data science workflow is just a DAG of steps that can be orchestrated by many different systems.
61 | 
62 | 
63 | 
64 | ## How do I? 
65 | 
66 | [Pass artifacts between steps in Metaflow flows](/docs/pass-artifacts-between-steps)
67 | 
68 | ---
69 | 
70 | [Schedule Metaflow flows on AWS Step Functions](/docs/schedule-flow-on-step-functions)
71 | 
72 | ---
73 | 
74 | 
75 | 
76 | 


--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
 1 | name: rag-demo
 2 | channels:
 3 |   - conda-forge
 4 | dependencies:
 5 |   - python
 6 |   - pip
 7 |   - pandas
 8 |   - metaflow
 9 |   - matplotlib
10 |   - seaborn
11 |   - jupyter
12 |   - pip:
13 |     - sentence-transformers
14 |     - openai
15 |     - lancedb
16 |     - python-frontmatter
17 |     - gitpython
18 |     - git+https://github.com/outerbounds/rag-demo
19 |     - python-slugify
20 |     - tldextract
21 |     - llama-index==0.8.0
22 |     - llama-cpp-python==0.1.77
23 |     - sentence-transformers
24 |     - openai
25 |     - llama-index
26 |     - lancedb
27 |     - pinecone-client[grpc]
28 |     - streamlit
29 |     # - fashion-clip


--------------------------------------------------------------------------------
/flows/config/repo_params.py:
--------------------------------------------------------------------------------
 1 | SAMPLE_OSS_MARKDOWN_REPOS = [
 2 |     {
 3 |         "deployment_url": "docs.metaflow.org",
 4 |         "repository_path": "https://github.com/Netflix/metaflow-docs",
 5 |         "repository_ref": "master",
 6 |         "base_search_path": "docs",
 7 |         "exclude_paths": ["docs/v"],
 8 |         "exclude_files": ["README.md", "README"],
 9 |     },
10 |     {
11 |         "deployment_url": "https://huggingface.co/docs/accelerate",
12 |         "repository_path": "https://github.com/huggingface/accelerate",
13 |         "repository_ref": "main",
14 |         "base_search_path": "docs",
15 |         "exclude_paths": [],
16 |         "exclude_files": [],
17 |     }
18 | ]


--------------------------------------------------------------------------------
/flows/data_table_processor.py:
--------------------------------------------------------------------------------
  1 | from metaflow import FlowSpec, step, Parameter, card, current, Flow, trigger_on_finish
  2 | from metaflow.cards import Image, Markdown
  3 | from matplotlib import pyplot as plt
  4 | import seaborn as sns
  5 | 
  6 | sns.set_style("dark")
  7 | COLORS = {
  8 |     "purple": "#98A1E1",
  9 |     "light-purple": "#DADEFB",
 10 |     "gold": "#F0C054"
 11 | }
 12 | 
 13 | 
 14 | @trigger_on_finish(flow='MarkdownChunker')
 15 | class DataTableProcessor(FlowSpec):
 16 | 
 17 |     save_processed_df = Parameter(
 18 |         "save_processed_df",
 19 |         help="Whether to save the processed dataframe to the run.",
 20 |         default=True,
 21 |         type=bool,
 22 |     )
 23 | 
 24 |     data_dir = Parameter(
 25 |         "data_dir",
 26 |         help="The directory to save the processed dataframe to.",
 27 |         default="data",
 28 |         type=str,
 29 |     )
 30 | 
 31 |     parent_flow = Parameter(
 32 |         "parent_flow",
 33 |         help="The flow id of the parent flow to process.",
 34 |         default='MarkdownChunker',
 35 |         type=str,
 36 |     )
 37 | 
 38 |     n_bins = Parameter(
 39 |         "n_bins",
 40 |         help="The number of bins to use in the histogram.",
 41 |         default=100,
 42 |         type=int,
 43 |     )
 44 | 
 45 |     word_count_threshold = Parameter(
 46 |         "word_count_threshold",
 47 |         help="The word count threshold to use in the histogram.",
 48 |         default=10,
 49 |         type=int,
 50 |     )
 51 | 
 52 |     char_count_threshold = Parameter(
 53 |         "char_count_threshold",
 54 |         help="The char count threshold to use in processing.",
 55 |         default=25,
 56 |         type=int,
 57 |     )
 58 | 
 59 |     def plot_char_word_histogram(self, char_count_threshold=0, word_count_threshold=0, _df=None, title="", ):
 60 |         fig, ax = plt.subplots(1, 2, figsize=(12, 4))
 61 |         ax[0] = _df.char_count.plot.hist(bins=self.n_bins, color=COLORS['purple'], ax=ax[0])
 62 |         ax[1] = _df.word_count.plot.hist(bins=self.n_bins, color=COLORS['light-purple'], ax=ax[1])
 63 |         if char_count_threshold > 0 or word_count_threshold > 0:
 64 |             ax[0].set_xlabel("Filtered character count > %d" % char_count_threshold)
 65 |             ax[1].set_xlabel("Filtered word count > %d" % word_count_threshold)
 66 |         ax[0].set_xscale("log")
 67 |         ax[1].set_xscale("log")
 68 |         ax[0].set_ylabel("Frequency")
 69 |         ax[0].spines.right.set_visible(False)
 70 |         ax[0].spines.top.set_visible(False)
 71 |         ax[1].spines.right.set_visible(False)
 72 |         ax[1].spines.top.set_visible(False)
 73 |         fig.suptitle(title, fontsize=24)
 74 |         fig.tight_layout()
 75 |         assert fig is not None, "Figure is None, check plot_char_word_histogram."
 76 |         return fig
 77 | 
 78 |     def plot_tld_count(self):
 79 |         fig, ax = plt.subplots(1, 1, figsize=(12, 4))
 80 |         self.processed_df.groupby('tld').count()['index'].sort_values(ascending=False).plot.bar(
 81 |             ax=ax, color=COLORS['gold']
 82 |         )
 83 |         fig.suptitle("Top-level domain representation in the dataset", fontsize=24)
 84 |         ax.set_xticklabels(ax.get_xticklabels(), rotation = 45)
 85 |         ax.set_xlabel("")
 86 |         ax.set_ylabel("Frequency")
 87 |         fig.tight_layout()
 88 |         assert fig is not None, "Figure is None, check plot_tld_count."
 89 |         return fig
 90 | 
 91 |     @card
 92 |     @step
 93 |     def start(self):
 94 | 
 95 |         parent_flow = Flow(self.parent_flow)
 96 |         run = parent_flow.latest_run 
 97 |         
 98 |         if not run.successful:
 99 |             print("Skipping processing of unsuccessful run {}.".format(run.id)) 
100 |             self.processed_df = None
101 | 
102 |         else:
103 | 
104 |             current.card.append(Markdown(f"""# Processing data table from run {run.id}"""))
105 | 
106 |             current.card.append(Markdown(f"""## Filtering rows"""))
107 |             df = run.data.df
108 |             fig = self.plot_char_word_histogram(_df = df, title="Before filtering")
109 |             current.card.append(Image.from_matplotlib(fig))
110 | 
111 |             # Filter out rows with less than N words.
112 |             _df = df[df.word_count > self.word_count_threshold]
113 | 
114 |             # Filter out rows with less than M chars.
115 |             _df = _df[_df.char_count > self.char_count_threshold]
116 | 
117 |             # Feature: Add a column for the TLD.
118 |             import tldextract
119 |             def get_tld(url):
120 |                 ext = tldextract.extract(url)
121 |                 if ext.subdomain != '':
122 |                     url = '.'.join(ext[:3])
123 |                 else:
124 |                     url = '.'.join(ext[1:3])
125 |                 return "https://" + url
126 |             _df['tld'] = _df['page_url'].apply(get_tld)
127 | 
128 |             # Reindex and keep index in upstream dataframe.
129 |             _df.reset_index(inplace=True)
130 |             _df.index = range(len(_df))
131 | 
132 |             fig = self.plot_char_word_histogram(
133 |                 word_count_threshold=self.word_count_threshold, 
134 |                 char_count_threshold=self.char_count_threshold,
135 |                 _df = _df, title="After filtering"
136 |             )
137 |             current.card.append(Image.from_matplotlib(fig))
138 | 
139 |             ### ADD MORE FILTERS HERE.
140 |             print("Filtered dataframe from shape {} to shape {}.".format(
141 |                 df.shape, _df.shape))
142 |             self.processed_df = _df
143 | 
144 |             # Plot the number of rows per TLD.
145 |             fig = self.plot_tld_count()
146 |             current.card.append(Image.from_matplotlib(fig))
147 | 
148 |             ### ADD MORE SUMMARY STATS HERE.
149 | 
150 |         self.next(self.end)
151 | 
152 |     @step
153 |     def end(self):
154 | 
155 |         import os
156 | 
157 |         print("The {} run {} has ended, with a dataframe of shape: {}".format(
158 |             current.flow_name, current.run_id,
159 |             self.processed_df.shape))
160 |         print(
161 |             f"""
162 |             You can now use the dataframe to do whatever you want.
163 |             To load it in a notebook, you can use the following code:
164 | 
165 |                 from metaflow import Flow, namespace
166 |                 namespace('{current.namespace}')
167 |                 run = Run('{current.flow_name}/{current.run_id}')
168 |                 processed_df = run.data.processed_df
169 |                 print(processed_df.shape)
170 |         """
171 |         )
172 | 
173 |         if self.save_processed_df:
174 |             self.processed_df_path = os.path.abspath(
175 |                 os.path.join(self.data_dir, 'processed_df_%s.csv' % current.run_id))
176 |             print("Saving processed dataframe to %s" % self.processed_df_path)
177 |             self.processed_df.to_csv(self.processed_df_path, index=False)
178 | 
179 | 
180 | if __name__ == '__main__':
181 |     DataTableProcessor()


--------------------------------------------------------------------------------
/flows/lancedb_index.py:
--------------------------------------------------------------------------------
 1 | from metaflow import FlowSpec, step, Flow, current
 2 | 
 3 | class LanceDBVectorIndexer(FlowSpec):
 4 | 
 5 |     table_name = "test"
 6 |     embedding_model = "paraphrase-MiniLM-L6-v2"
 7 |     embedding_target_col_name = "contents"
 8 | 
 9 |     def find_processed_df(self):
10 |         for run in Flow('DataTableProcessor'):
11 |             if run.data.save_processed_df:
12 |                 print("Found processed df in run: {}".format(run.id))
13 |                 return run.data.processed_df
14 | 
15 |     @step
16 |     def start(self):
17 |         self.next(self.create_index)
18 | 
19 |     @step
20 |     def create_index(self):
21 | 
22 |         from rag_tools.databases.vector_database import LanceDB
23 |         from rag_tools.embedders.embedder import SentenceTransformerEmbedder
24 |         import pandas as pd
25 | 
26 |         # fetch data and embed it
27 |         self.data = self.find_processed_df()
28 |         encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu")
29 |         docs = self.data[self.embedding_target_col_name].tolist()
30 |         self.ids = list(range(1, len(docs) + 1))
31 |         embeddings = encoder.embed(docs)
32 |         self.dimension = len(embeddings[0])
33 | 
34 |         # put the vectors in the index
35 |         db = LanceDB()
36 |         db.create_index(self.table_name, embeddings, docs, self.ids)
37 | 
38 |         self.next(self.end) 
39 | 
40 |     @step
41 |     def end(self):
42 | 
43 |         from rag_tools.databases.vector_database import LanceDB
44 |         from rag_tools.embedders.embedder import SentenceTransformerEmbedder
45 | 
46 |         db = LanceDB()
47 | 
48 |         # search the index in a test query
49 |         K = 3
50 |         test_prompt = "aws"
51 |         encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu")
52 |         self.search_vector = encoder.embed([test_prompt])[0]
53 |         self.results = db.vector_search(self.table_name, self.search_vector, k=K)
54 | 
55 |         print(f"""
56 |         Access flow results with:
57 | 
58 |             from metaflow import Run
59 |             run = Run('{current.flow_name}/{current.run_id}')
60 |             results = run.data.results
61 | 
62 |         Resume LanceDBVectorIndexer with:
63 | 
64 |             from rag_tools.databases.vector_database import LanceDB
65 |             db = LanceDB() # default storage location is `../../chatbot.lance`, relative to your cwd.
66 |             db.vector_search(table_name, search_vector, k=K)
67 |         """)
68 | 
69 | 
70 | if __name__ == '__main__':
71 |     LanceDBVectorIndexer()


--------------------------------------------------------------------------------
/flows/markdown_chunker.py:
--------------------------------------------------------------------------------
 1 | from metaflow import FlowSpec, step, current, schedule
 2 | from rag_tools.filetypes.markdown import Mixin as MarkdownMixin
 3 | 
 4 | @schedule(weekly=True)
 5 | class MarkdownChunker(FlowSpec, MarkdownMixin):
 6 | 
 7 |     @step
 8 |     def start(self):
 9 |         """
10 |         Start the flow.
11 |         Try to download the content from the repository.
12 |         """
13 | 
14 |         # see config.py for the definition of repo_params
15 |         # it is a list of dictionaries, 
16 |         # that tell the Markdown tools where to look for content.
17 |         # see /notebooks/markdonw_repo_explorer.ipynb for more details.
18 |         from config.repo_params import SAMPLE_OSS_MARKDOWN_REPOS
19 | 
20 |         self.repo_params = SAMPLE_OSS_MARKDOWN_REPOS
21 |         self.df = self.load_df_from_repo_list()
22 |         self.next(self.end)
23 | 
24 |     @step
25 |     def end(self):
26 |         print("The flow has ended, with a dataframe of shape: {}".format(self.df.shape))
27 |         print(
28 |             f"""
29 |             You can now use the dataframe to do whatever you want.
30 |             To load it in a notebook, you can use the following code:
31 | 
32 |                 from metaflow import Flow, namespace
33 |                 namespace('{current.namespace}')
34 |                 run = Run('{current.flow_name}/{current.run_id}')
35 |                 df = run.data.df
36 |                 print(df.shape)
37 |         """)
38 | 
39 | 
40 | if __name__ == "__main__":
41 |     MarkdownChunker()


--------------------------------------------------------------------------------
/flows/pinecone_index.py:
--------------------------------------------------------------------------------
 1 | from metaflow import FlowSpec, step, Flow, environment
 2 | import os
 3 | 
 4 | env_vars = {
 5 |     'PINECONE_API_KEY': os.environ['PINECONE_API_KEY'], 
 6 |     'GCP_ENVIRONMENT': os.environ['GCP_ENVIRONMENT']
 7 | }
 8 | 
 9 | class PineconeVectorIndexer(FlowSpec):
10 | 
11 |     index_name = "test"
12 |     embedding_model = "paraphrase-MiniLM-L6-v2"
13 |     embedding_target_col_name = "contents"
14 | 
15 |     def find_processed_df(self):
16 |         for run in Flow('DataTableProcessor'):
17 |             if run.data.save_processed_df:
18 |                 print("Found processed df in run: {}".format(run.id))
19 |                 return run.data.processed_df
20 | 
21 |     @step
22 |     def start(self):
23 |         self.next(self.create_index)
24 | 
25 |     @environment(vars=env_vars)
26 |     @step
27 |     def create_index(self):
28 | 
29 |         from rag_tools.databases.vector_database import PineconeDB
30 |         from rag_tools.embedders.embedder import SentenceTransformerEmbedder
31 |         import pandas as pd
32 | 
33 |         # fetch data and embed it
34 |         self.data = self.find_processed_df()
35 |         encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu")
36 |         docs = self.data[self.embedding_target_col_name].tolist()
37 |         self.ids = list(range(1, len(docs) + 1))
38 |         embeddings = encoder.embed(docs)
39 |         self.dimension = len(embeddings[0])
40 | 
41 |         # create the index
42 |         db = PineconeDB()
43 |         db.create_index(self.index_name, dimension=self.dimension)
44 | 
45 |         # put the vectors in the index
46 |         db.upsert(self.index_name, embeddings, docs, self.ids)
47 | 
48 |         self.next(self.end) 
49 | 
50 |     @environment(vars=env_vars)
51 |     @step
52 |     def end(self):
53 | 
54 |         from rag_tools.databases.vector_database import PineconeDB
55 |         from rag_tools.embedders.embedder import SentenceTransformerEmbedder
56 | 
57 |         # create_index is idempotent
58 |         db = PineconeDB()
59 |         db.create_index(self.index_name, dimension=self.dimension)
60 | 
61 |         # search the index in a test query
62 |         K = 3
63 |         test_prompt = "aws"
64 |         encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu")
65 |         self.search_vector = encoder.embed([test_prompt])[0]
66 |         self.results = db.vector_search(self.index_name, self.search_vector, k=K).to_dict()
67 | 
68 |         for result in self.results['matches']:
69 |             print("\n\nid: {} - score: {} \n\n{}\n\n".format(result['id'], result['score'], result['metadata']['text']))
70 |             print("===============================================")
71 | 
72 |         print("\n\n Flow is done, check for results in the {} index at https://app.pinecone.io/.".format(self.index_name))
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     PineconeVectorIndexer()


--------------------------------------------------------------------------------
/flows/test_simple_embedding.py:
--------------------------------------------------------------------------------
 1 | from rag_tools.databases.vector_database import LanceDB, PineconeDB
 2 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder
 3 | from metaflow import Flow
 4 | import pandas as pd
 5 | 
 6 | def find_processed_df():
 7 |     for run in Flow('DataTableProcessor'):
 8 |         if run.data.save_processed_df:
 9 |             print("Found processed df in run: {}".format(run.id))
10 |             return run.data.processed_df
11 | 
12 | # fetch data and embed it
13 | data = find_processed_df()
14 | encoder = SentenceTransformerEmbedder("paraphrase-MiniLM-L6-v2", device="cpu")
15 | docs = data["contents"].tolist()
16 | ids = list(range(1, len(docs) + 1))
17 | embeddings = encoder.embed(docs)
18 | 
19 | DB = "lance"
20 | 
21 | if DB == "lance":
22 |     db = LanceDB()
23 |     db.create_index("test", embeddings, docs, ids)
24 | 
25 | elif DB == "pinecone":
26 |     db = PineconeDB()
27 |     db.create_index("test", dimension=len(embeddings[0]))
28 |     db.upsert("test", embeddings, docs, ids)
29 | 
30 | # search the index
31 | K = 3
32 | user_prompt = "aws"
33 | search_vector = encoder.embed([user_prompt])[0]
34 | results = db.vector_search("test", search_vector, k=K) #.to_dict()
35 | 
36 | for result in results['matches']:
37 |     print("id: {} - distance score: {} \n\n{}\n\n".format(result['id'], result['score'], result['metadata']['text']))
38 | 
39 | db.destroy_index("test")


--------------------------------------------------------------------------------
/flows/test_simple_query.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from rag_tools.databases.vector_database import LanceDB
 3 | from rag_tools.llms.llms_search import ChatGPTWrapper
 4 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder
 5 | 
 6 | # import lancedb
 7 | # URI = "data/sample-lancedb"
 8 | # db = lancedb.connect(URI)
 9 | 
10 | query = "How do I specify conda dependencies in my flow?"
11 | 
12 | # embed with sentence transformer
13 | encoder = SentenceTransformerEmbedder("paraphrase-MiniLM-L6-v2", device="cpu")
14 | search_vector = encoder.embed([query])[0]
15 | 
16 | # embed with sentence transformer
17 | best_text = db.vector_search(search_vector, k=2)
18 | 
19 | # we build some context for the question
20 | text = "\n\n".join(best_text['text'].tolist())
21 | 
22 | # query prompt for chatgpt
23 | prompt = f"Please answer this question {query}\n\nhere's the context you should use:\n\n{text}.\n\nIf the answer is not provided in the context, answer I don't know."
24 | 
25 | output = ChatGPTWrapper(os.environ['OPENAI_API_KEY']).sample(prompt)
26 | 
27 | print(f"Question: {prompt}")
28 | print()
29 | print(f"Answer: {output}")


--------------------------------------------------------------------------------
/images/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/images/cover.png


--------------------------------------------------------------------------------
/notebooks/llama_index_exploration.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Set your API Key"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "code",
  12 |    "execution_count": null,
  13 |    "metadata": {},
  14 |    "outputs": [],
  15 |    "source": [
  16 |     "# %env OPENAI_API_KEY=<Fetch at: https://platform.openai.com/account/api-keys>"
  17 |    ]
  18 |   },
  19 |   {
  20 |    "cell_type": "markdown",
  21 |    "metadata": {},
  22 |    "source": [
  23 |     "# Import dependencies"
  24 |    ]
  25 |   },
  26 |   {
  27 |    "cell_type": "code",
  28 |    "execution_count": 61,
  29 |    "metadata": {},
  30 |    "outputs": [],
  31 |    "source": [
  32 |     "# built-ins\n",
  33 |     "from pathlib import Path\n",
  34 |     "from functools import partial\n",
  35 |     "import os\n",
  36 |     "from typing import Tuple, List\n",
  37 |     "from dataclasses import dataclass\n",
  38 |     "\n",
  39 |     "# custom lib\n",
  40 |     "from rag_tools.repo.ops import ensure_repo_exists_locally, DocumentationExtractor\n",
  41 |     "\n",
  42 |     "# third party libs\n",
  43 |     "from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, Document, download_loader\n",
  44 |     "from llama_index.retrievers import VectorIndexRetriever\n",
  45 |     "from llama_index.response_synthesizers import get_response_synthesizer\n",
  46 |     "from llama_index.llms import OpenAI\n",
  47 |     "from metaflow import Flow\n",
  48 |     "import pandas as pd\n",
  49 |     "from IPython.display import display, Markdown"
  50 |    ]
  51 |   },
  52 |   {
  53 |    "cell_type": "markdown",
  54 |    "metadata": {},
  55 |    "source": [
  56 |     "# Define utility functions"
  57 |    ]
  58 |   },
  59 |   {
  60 |    "cell_type": "code",
  61 |    "execution_count": 11,
  62 |    "metadata": {},
  63 |    "outputs": [],
  64 |    "source": [
  65 |     "@dataclass\n",
  66 |     "class Context:\n",
  67 |     "    def __init__(self, response: str, source_node_ids: List[str]):\n",
  68 |     "        self.response = response\n",
  69 |     "        self.source_node_ids = source_node_ids\n",
  70 |     "\n",
  71 |     "    def get_link_df(self, meta_df, link_col = 'doc_id'):\n",
  72 |     "        return meta_df[meta_df[link_col].isin(self.source_node_ids)]\n",
  73 |     "\n",
  74 |     "def qa_iter(\n",
  75 |     "    question: str, \n",
  76 |     "    index: VectorStoreIndex, \n",
  77 |     "    k:int = 2, \n",
  78 |     "    response_mode:str = 'tree_summarize'\n",
  79 |     ") -> Context:\n",
  80 |     "    \"Match a question against an index and returns the response.\"\n",
  81 |     "    retriever = VectorIndexRetriever(index=index, similarity_top_k=k)\n",
  82 |     "    response_synthesizer = get_response_synthesizer(response_mode=response_mode)\n",
  83 |     "    query_engine = index.as_query_engine(response_synthesizer=response_synthesizer, retriever=retriever)\n",
  84 |     "    query_res = query_engine.query(question)\n",
  85 |     "    return Context(\n",
  86 |     "        response=query_res.response, source_node_ids=list(query_res.metadata.keys())\n",
  87 |     "    )\n",
  88 |     "\n",
  89 |     "dm = lambda x: display(Markdown(x))\n",
  90 |     "def dmqa(q, a): \n",
  91 |     "    dm(f\"\"\"\n",
  92 |     "**Question:** {q}\n",
  93 |     "\n",
  94 |     "**Answer:** {a}\n",
  95 |     "\"\"\"\n",
  96 |     ")\n",
  97 |     "\n",
  98 |     "def nb_output_format(question, response, similar_chunk_df):\n",
  99 |     "    dm(f\"#### {question}\")\n",
 100 |     "    dm(f\"**Retrieved Response**\")\n",
 101 |     "    dm(response)\n",
 102 |     "    dm(f\"#### Sources\")\n",
 103 |     "    for _, chunk in similar_chunk_df.iterrows():\n",
 104 |     "        dm(f\"##### [{chunk.header}]({chunk.page_url})\")\n",
 105 |     "        dm(f\"{chunk.contents[:100]}...\")\n",
 106 |     "\n",
 107 |     "def get_documents_from_content_section_df(df):\n",
 108 |     "    ids = []; documents = []\n",
 109 |     "    for i, text in enumerate(df.contents):\n",
 110 |     "        doc = Document(text=text, id_=i)\n",
 111 |     "        documents.append(doc)\n",
 112 |     "        ids.append(doc.id_)\n",
 113 |     "    return documents, ids\n",
 114 |     "\n",
 115 |     "def generative_search_engine_iter(question, index, meta_df, meta_df_id_col='doc_id'):\n",
 116 |     "    \"Assumes index and df are defined in the global scope\"\n",
 117 |     "    context = qa_iter(question, index)\n",
 118 |     "    similar_chunk_df = meta_df[meta_df[meta_df_id_col].isin(context.source_node_ids)]\n",
 119 |     "    nb_output_format(question, context.response, similar_chunk_df)\n",
 120 |     "\n",
 121 |     "def get_documents_from_md_file_paths(fps):\n",
 122 |     "    MarkdownReader = download_loader(\"MarkdownReader\")\n",
 123 |     "    loader = MarkdownReader()\n",
 124 |     "    documents = []\n",
 125 |     "    for fp in fps:\n",
 126 |     "        documents += loader.load_data(file=Path(fp))\n",
 127 |     "    return documents"
 128 |    ]
 129 |   },
 130 |   {
 131 |    "cell_type": "markdown",
 132 |    "metadata": {},
 133 |    "source": [
 134 |     "# 🛑 Set variables based on your machine's setup"
 135 |    ]
 136 |   },
 137 |   {
 138 |    "cell_type": "code",
 139 |    "execution_count": 4,
 140 |    "metadata": {},
 141 |    "outputs": [],
 142 |    "source": [
 143 |     "# these are part of this repo, so you don't need to change\n",
 144 |     "DATA_DIR='../data'\n",
 145 |     "LLAMA_INDEX_TUTORIAL_DATA = os.path.join(DATA_DIR, 'llama-index-tutorial')\n",
 146 |     "\n",
 147 |     "# this is unique to your machine. where did you clone https://github.com/Netflix/metaflow-docs to?\n",
 148 |     "YOUR_LOCAL_METAFLOW_DOCS_REPO_PATH = os.path.expanduser(\"~/Dev/metaflow-docs\")"
 149 |    ]
 150 |   },
 151 |   {
 152 |    "cell_type": "markdown",
 153 |    "metadata": {},
 154 |    "source": [
 155 |     "# Which GitHub repos?"
 156 |    ]
 157 |   },
 158 |   {
 159 |    "cell_type": "code",
 160 |    "execution_count": 5,
 161 |    "metadata": {},
 162 |    "outputs": [],
 163 |    "source": [
 164 |     "REPO_PARAMS = [\n",
 165 |     "    {\n",
 166 |     "        \"deployment_url\": \"docs.metaflow.org\",\n",
 167 |     "        \"repository_path\": \"https://github.com/Netflix/metaflow-docs\",\n",
 168 |     "        \"repository_ref\": \"master\",\n",
 169 |     "        \"base_search_path\": \"docs\",\n",
 170 |     "        \"exclude_paths\": [\"docs/v\"],\n",
 171 |     "        \"exclude_files\": [\"README.md\", \"README\"],\n",
 172 |     "    }\n",
 173 |     "]"
 174 |    ]
 175 |   },
 176 |   {
 177 |    "cell_type": "markdown",
 178 |    "metadata": {},
 179 |    "source": [
 180 |     "# 1. Llama Index Warmup"
 181 |    ]
 182 |   },
 183 |   {
 184 |    "cell_type": "code",
 185 |    "execution_count": 63,
 186 |    "metadata": {},
 187 |    "outputs": [],
 188 |    "source": [
 189 |     "# This cell will trigger llama_index to look for OPENAI_API_KEY in environment variables,\n",
 190 |     "# then default to downloading a llama 2 model binary locally.\n",
 191 |     "\n",
 192 |     "# this thing is looking for a .txt file in the data dir.\n",
 193 |     "documents = SimpleDirectoryReader(LLAMA_INDEX_TUTORIAL_DATA).load_data()\n",
 194 |     "\n",
 195 |     "# Indexing is the first, and most crucial, stage in a RAG workflow.\n",
 196 |     "# It is the process of converting a set of documents into a vector representation.\n",
 197 |     "# This vector representation is later used to retrieve relevant documents for a given query.\n",
 198 |     "service_context = ServiceContext.from_defaults(\n",
 199 |     "    llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.0)\n",
 200 |     ")\n",
 201 |     "index = VectorStoreIndex.from_documents(documents, service_context=service_context) "
 202 |    ]
 203 |   },
 204 |   {
 205 |    "cell_type": "code",
 206 |    "execution_count": 64,
 207 |    "metadata": {},
 208 |    "outputs": [
 209 |     {
 210 |      "data": {
 211 |       "text/markdown": [
 212 |        "The author worked on writing and programming outside of school before college. They wrote short stories and tried writing programs on an IBM 1401 computer. They also built a microcomputer kit and started programming on it, writing simple games and a word processor."
 213 |       ],
 214 |       "text/plain": [
 215 |        "<IPython.core.display.Markdown object>"
 216 |       ]
 217 |      },
 218 |      "metadata": {},
 219 |      "output_type": "display_data"
 220 |     }
 221 |    ],
 222 |    "source": [
 223 |     "# Query engine \"takes in a natural language query, and returns a response, along with reference context retrieved and passed to the LLM.\"\n",
 224 |     "query_engine = index.as_query_engine()\n",
 225 |     "response = query_engine.query(\"What did the author do growing up?\")\n",
 226 |     "dm(response.response)\n",
 227 |     "\n",
 228 |     "# Taking too many minutes to run with Llama 2 on my macbook :( \n",
 229 |     "# Conservatively estimate OpenAI API is ~$1 per dozen end-to-end runs of this notebook."
 230 |    ]
 231 |   },
 232 |   {
 233 |    "cell_type": "markdown",
 234 |    "metadata": {},
 235 |    "source": [
 236 |     "## Try a [Llama Hub tool for parsing `.md` files](https://llamahub.ai/l/file-markdown)\n",
 237 |     "\n",
 238 |     "This could be used in conjunction or in place of the custom markdown parser used in `./markdown_chunker.py`"
 239 |    ]
 240 |   },
 241 |   {
 242 |    "cell_type": "code",
 243 |    "execution_count": 40,
 244 |    "metadata": {},
 245 |    "outputs": [],
 246 |    "source": [
 247 |     "from pathlib import Path\n",
 248 |     "from llama_index import download_loader\n",
 249 |     "import os"
 250 |    ]
 251 |   },
 252 |   {
 253 |    "cell_type": "code",
 254 |    "execution_count": 41,
 255 |    "metadata": {},
 256 |    "outputs": [],
 257 |    "source": [
 258 |     "MarkdownReader = download_loader(\"MarkdownReader\")\n",
 259 |     "loader = MarkdownReader()\n",
 260 |     "\n",
 261 |     "# start with a single document\n",
 262 |     "test_path = os.path.abspath(\"%s/test-data/ob/blog/metaflow-fast-data.md\" % DATA_DIR)\n",
 263 |     "documents = loader.load_data(file=Path(test_path))"
 264 |    ]
 265 |   },
 266 |   {
 267 |    "cell_type": "code",
 268 |    "execution_count": 65,
 269 |    "metadata": {},
 270 |    "outputs": [],
 271 |    "source": [
 272 |     "index = VectorStoreIndex.from_documents(documents, service_context=service_context)"
 273 |    ]
 274 |   },
 275 |   {
 276 |    "cell_type": "markdown",
 277 |    "metadata": {},
 278 |    "source": [
 279 |     "## Ask some questions over the index"
 280 |    ]
 281 |   },
 282 |   {
 283 |    "cell_type": "code",
 284 |    "execution_count": 66,
 285 |    "metadata": {},
 286 |    "outputs": [
 287 |     {
 288 |      "data": {
 289 |       "text/markdown": [
 290 |        "\n",
 291 |        "**Question:** What is the fastest way to load data onto AWS Batch instances using Metaflow?\n",
 292 |        "\n",
 293 |        "**Answer:** The fastest way to load data onto AWS Batch instances using Metaflow would be to utilize the data loading capabilities provided by Metaflow itself. Metaflow offers built-in functionality for handling data loading and processing, allowing you to efficiently transfer and process data on AWS Batch instances. By leveraging Metaflow's data loading features, you can optimize the loading process and ensure efficient utilization of AWS Batch resources.\n"
 294 |       ],
 295 |       "text/plain": [
 296 |        "<IPython.core.display.Markdown object>"
 297 |       ]
 298 |      },
 299 |      "metadata": {},
 300 |      "output_type": "display_data"
 301 |     }
 302 |    ],
 303 |    "source": [
 304 |     "question = (\n",
 305 |     "    \"What is the fastest way to load data onto AWS Batch instances using Metaflow?\"\n",
 306 |     ")\n",
 307 |     "\n",
 308 |     "query_engine = index.as_query_engine()\n",
 309 |     "response = query_engine.query(question).response\n",
 310 |     "\n",
 311 |     "dmqa(question, response)"
 312 |    ]
 313 |   },
 314 |   {
 315 |    "cell_type": "code",
 316 |    "execution_count": 68,
 317 |    "metadata": {},
 318 |    "outputs": [
 319 |     {
 320 |      "data": {
 321 |       "text/markdown": [
 322 |        "\n",
 323 |        "**Question:** How does Metaflow use `tmpfs`?\n",
 324 |        "\n",
 325 |        "**Answer:** Metaflow uses `tmpfs` to store temporary data during the execution of workflows. `tmpfs` is a temporary file system that resides in memory, which means that the data stored in `tmpfs` is not persisted across system reboots. This makes it ideal for storing temporary data that is only needed during the execution of a workflow and can be discarded afterwards. By using `tmpfs`, Metaflow can achieve faster read and write operations compared to using disk-based storage.\n"
 326 |       ],
 327 |       "text/plain": [
 328 |        "<IPython.core.display.Markdown object>"
 329 |       ]
 330 |      },
 331 |      "metadata": {},
 332 |      "output_type": "display_data"
 333 |     }
 334 |    ],
 335 |    "source": [
 336 |     "question = \"How does Metaflow use `tmpfs`?\"\n",
 337 |     "response = query_engine.query(question).response\n",
 338 |     "dmqa(question, response)"
 339 |    ]
 340 |   },
 341 |   {
 342 |    "cell_type": "markdown",
 343 |    "metadata": {},
 344 |    "source": [
 345 |     "# 2. Controlling hallucinations by curating an index\n",
 346 |     "Here are a few questions we will explore in this section:\n",
 347 |     "\n",
 348 |     "**What is the problem with the above workflow?**\n",
 349 |     "\n",
 350 |     "In the previous section's index, we created all the vectors from chunks of a [post](https://outerbounds.com/blog/metaflow-fast-data/) specifically about the `tmpfs` feature. [Metaflow docs](https://docs.metaflow.org/) don't contain that much content about `tmpfs` yet.\n",
 351 |     "\n",
 352 |     "If we use the Metaflow docs as the source objects to populate the index, and a question is asked to the model about `tmpfs`, how can we know if it is hallucinating it, or referencing an actual piece of content that Metaflow maintainers endorse?\n",
 353 |     "> Bing Chat AI:\n",
 354 |     "Give a bunch of links to the content that LLM response was conditioned on.\n",
 355 |     "\n",
 356 |     "We will build a simple system like this in the next section. First, let's see the power of understanding the domain of our index, and then move to using it as a way to reference source material in the generated response."
 357 |    ]
 358 |   },
 359 |   {
 360 |    "cell_type": "markdown",
 361 |    "metadata": {},
 362 |    "source": [
 363 |     "## Index Metaflow docs"
 364 |    ]
 365 |   },
 366 |   {
 367 |    "cell_type": "code",
 368 |    "execution_count": 69,
 369 |    "metadata": {},
 370 |    "outputs": [],
 371 |    "source": [
 372 |     "file_paths = DocumentationExtractor().filter_files(\n",
 373 |     "    YOUR_LOCAL_METAFLOW_DOCS_REPO_PATH,\n",
 374 |     "    base_search_path = \"docs\",\n",
 375 |     "    exclude_paths = [\"docs/v\"],\n",
 376 |     "    exclude_files = [\"README.md\", \"README\"],\n",
 377 |     "    considered_extensions = [\".md\"],\n",
 378 |     ")"
 379 |    ]
 380 |   },
 381 |   {
 382 |    "cell_type": "code",
 383 |    "execution_count": 70,
 384 |    "metadata": {},
 385 |    "outputs": [],
 386 |    "source": [
 387 |     "file_paths = [abs_path for abs_path, _ in file_paths]"
 388 |    ]
 389 |   },
 390 |   {
 391 |    "cell_type": "code",
 392 |    "execution_count": 71,
 393 |    "metadata": {},
 394 |    "outputs": [],
 395 |    "source": [
 396 |     "documents = get_documents_from_md_file_paths(file_paths)"
 397 |    ]
 398 |   },
 399 |   {
 400 |    "cell_type": "code",
 401 |    "execution_count": 72,
 402 |    "metadata": {},
 403 |    "outputs": [
 404 |     {
 405 |      "name": "stdout",
 406 |      "output_type": "stream",
 407 |      "text": [
 408 |       "Showing sample of 1 out of 500 <class 'llama_index.schema.Document'> objects\n"
 409 |      ]
 410 |     },
 411 |     {
 412 |      "data": {
 413 |       "text/plain": [
 414 |        "[Document(id_='b185a5f6-76a2-4095-9523-adb59d3518c9', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='168707b49cebf5424ed48615defbba683a0d7932f8451d263d491040c37aa87f', text='\\n\\nWelcome to Metaflow\\n\\nMetaflow makes it easy to build and manage real-life data science and machine learning projects.\\n\\n\\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]"
 415 |       ]
 416 |      },
 417 |      "execution_count": 72,
 418 |      "metadata": {},
 419 |      "output_type": "execute_result"
 420 |     }
 421 |    ],
 422 |    "source": [
 423 |     "N = 1\n",
 424 |     "print(\n",
 425 |     "    \"Showing sample of {m} out of {n} {t} objects\".format(\n",
 426 |     "        m=N, n=len(documents), t=type(documents[0])\n",
 427 |     "    )\n",
 428 |     ")\n",
 429 |     "documents[:N]"
 430 |    ]
 431 |   },
 432 |   {
 433 |    "cell_type": "code",
 434 |    "execution_count": 73,
 435 |    "metadata": {},
 436 |    "outputs": [
 437 |     {
 438 |      "name": "stdout",
 439 |      "output_type": "stream",
 440 |      "text": [
 441 |       "CPU times: user 582 ms, sys: 187 ms, total: 770 ms\n",
 442 |       "Wall time: 14.1 s\n"
 443 |      ]
 444 |     }
 445 |    ],
 446 |    "source": [
 447 |     "%%time\n",
 448 |     "index = VectorStoreIndex.from_documents(documents)\n",
 449 |     "# TODO: Measure times as this thing scales with N documents and larger sized documents."
 450 |    ]
 451 |   },
 452 |   {
 453 |    "cell_type": "markdown",
 454 |    "metadata": {},
 455 |    "source": [
 456 |     "## Q&A iterations over the Metaflow docs index"
 457 |    ]
 458 |   },
 459 |   {
 460 |    "cell_type": "code",
 461 |    "execution_count": 74,
 462 |    "metadata": {},
 463 |    "outputs": [
 464 |     {
 465 |      "data": {
 466 |       "text/markdown": [
 467 |        "\n",
 468 |        "**Question:** What is Metaflow?\n",
 469 |        "\n",
 470 |        "**Answer:** Metaflow is a Python library that simplifies the development, deployment, and operation of data-intensive applications, specifically those related to data science and machine learning. It was initially created at Netflix to enhance the efficiency of data scientists working on a range of projects, from traditional statistics to cutting-edge deep learning. Metaflow is an open-source tool released under the Apache License, Version 2.0.\n"
 471 |       ],
 472 |       "text/plain": [
 473 |        "<IPython.core.display.Markdown object>"
 474 |       ]
 475 |      },
 476 |      "metadata": {},
 477 |      "output_type": "display_data"
 478 |     }
 479 |    ],
 480 |    "source": [
 481 |     "question = \"What is Metaflow?\"\n",
 482 |     "context = qa_iter(question, index)\n",
 483 |     "dmqa(question, context.response)"
 484 |    ]
 485 |   },
 486 |   {
 487 |    "cell_type": "code",
 488 |    "execution_count": 75,
 489 |    "metadata": {},
 490 |    "outputs": [
 491 |     {
 492 |      "data": {
 493 |       "text/markdown": [
 494 |        "\n",
 495 |        "**Question:** How do I specify conda dependencies in my flow?\n",
 496 |        "\n",
 497 |        "**Answer:** You can specify conda dependencies in your flow using the `@conda_base` and `@conda` decorators. The `@conda_base` decorator is used at the flow level to specify explicit library dependencies, python version, and whether to exclude all steps from executing within a conda environment. The `@conda` decorator is used at the step level to update the explicit library dependencies, python version, and conda environment exclusion as specified by the `@conda_base` decorator. By using these decorators, you can define the conda environment for each step in your flow. Additionally, you can add an explicit dependency on a specific module by using the `@conda` decorator in the corresponding step.\n"
 498 |       ],
 499 |       "text/plain": [
 500 |        "<IPython.core.display.Markdown object>"
 501 |       ]
 502 |      },
 503 |      "metadata": {},
 504 |      "output_type": "display_data"
 505 |     }
 506 |    ],
 507 |    "source": [
 508 |     "question = \"How do I specify conda dependencies in my flow?\"\n",
 509 |     "context = qa_iter(question, index)\n",
 510 |     "dmqa(question, context.response)"
 511 |    ]
 512 |   },
 513 |   {
 514 |    "cell_type": "markdown",
 515 |    "metadata": {},
 516 |    "source": [
 517 |     "### Do Metaflow docs know about `tmpfs` though?"
 518 |    ]
 519 |   },
 520 |   {
 521 |    "cell_type": "code",
 522 |    "execution_count": 76,
 523 |    "metadata": {},
 524 |    "outputs": [
 525 |     {
 526 |      "data": {
 527 |       "text/markdown": [
 528 |        "\n",
 529 |        "**Question:** How does Metaflow use `tmpfs`?\n",
 530 |        "\n",
 531 |        "**Answer:** Metaflow does not use `tmpfs` based on the given information.\n"
 532 |       ],
 533 |       "text/plain": [
 534 |        "<IPython.core.display.Markdown object>"
 535 |       ]
 536 |      },
 537 |      "metadata": {},
 538 |      "output_type": "display_data"
 539 |     }
 540 |    ],
 541 |    "source": [
 542 |     "question = \"How does Metaflow use `tmpfs`?\"\n",
 543 |     "dmqa(question, qa_iter(question, index).response)"
 544 |    ]
 545 |   },
 546 |   {
 547 |    "cell_type": "markdown",
 548 |    "metadata": {},
 549 |    "source": [
 550 |     "### Adding specific knowledge to the index\n",
 551 |     "\n",
 552 |     "As of August, 2023, the Metaflow documentation do not have much writing about `tmpfs`, so this makes sense.\n",
 553 |     "\n",
 554 |     "How can we add [Outerbounds blog post](https://outerbounds.com/blog/metaflow-fast-data/) that announced the `tmpfs` and Metaflow integration to the index, to give the model the context it needs to answer this question?\n",
 555 |     "\n",
 556 |     "Let's create an index that combines the one we saw earlier for the `tmpfs` blog post with the one we just created for Metaflow docs."
 557 |    ]
 558 |   },
 559 |   {
 560 |    "cell_type": "code",
 561 |    "execution_count": 78,
 562 |    "metadata": {},
 563 |    "outputs": [],
 564 |    "source": [
 565 |     "fast_data_file_path = os.path.abspath('%s/test-data/ob/blog/metaflow-fast-data.md' % DATA_DIR)\n",
 566 |     "\n",
 567 |     "# combining the document set\n",
 568 |     "fast_data_doc = get_documents_from_md_file_paths([fast_data_file_path])\n",
 569 |     "index_fast_data_post = VectorStoreIndex.from_documents(fast_data_doc)"
 570 |    ]
 571 |   },
 572 |   {
 573 |    "cell_type": "code",
 574 |    "execution_count": 79,
 575 |    "metadata": {},
 576 |    "outputs": [
 577 |     {
 578 |      "data": {
 579 |       "text/markdown": [
 580 |        "\n",
 581 |        "**Question:** How does Metaflow use `tmpfs`?\n",
 582 |        "\n",
 583 |        "**Answer:** Metaflow recently implemented support for memory-based `tmpfs` filesystem on AWS Batch and Kubernetes. This feature allows users to create an ephemeral filesystem backed by memory on the fly, without making any changes to the infrastructure. By enabling this feature using the `@batch(use_tmpfs=True)` decorator for AWS Batch workloads or `@kubernetes(use_tmpfs=True)` decorator for Kubernetes, the `metaflow.S3` client is automatically aware of the `tmpfs` volume and will use it to speed up downloads. This helps improve the performance of data downloads from S3 in Metaflow workflows.\n"
 584 |       ],
 585 |       "text/plain": [
 586 |        "<IPython.core.display.Markdown object>"
 587 |       ]
 588 |      },
 589 |      "metadata": {},
 590 |      "output_type": "display_data"
 591 |     }
 592 |    ],
 593 |    "source": [
 594 |     "# same question as above. now we can answer it with the new index.\n",
 595 |     "question = \"How does Metaflow use `tmpfs`?\"\n",
 596 |     "dmqa(question, qa_iter(question, index_fast_data_post).response)"
 597 |    ]
 598 |   },
 599 |   {
 600 |    "cell_type": "code",
 601 |    "execution_count": 81,
 602 |    "metadata": {},
 603 |    "outputs": [],
 604 |    "source": [
 605 |     "# indexes are updateable/composable! \n",
 606 |     "for doc_chunk in fast_data_doc:\n",
 607 |     "    index.insert(doc_chunk)"
 608 |    ]
 609 |   },
 610 |   {
 611 |    "cell_type": "code",
 612 |    "execution_count": 82,
 613 |    "metadata": {},
 614 |    "outputs": [
 615 |     {
 616 |      "data": {
 617 |       "text/markdown": [
 618 |        "\n",
 619 |        "**Question:** How does Metaflow use `tmpfs`?\n",
 620 |        "\n",
 621 |        "**Answer:** Metaflow uses `tmpfs` by implementing support for memory-based `tmpfs` filesystem on Batch and Kubernetes. This allows users to create an ephemeral filesystem backed by memory on the fly, without having to make any changes on the infrastructure side. When the `tmpfs` feature is enabled, the `metaflow.S3` client automatically uses it to speed up downloads. To enable this feature, users can add `@batch(use_tmpfs=True)` for AWS Batch workloads or `@kubernetes(use_tmpfs=True)` for Kubernetes in their Metaflow code.\n"
 622 |       ],
 623 |       "text/plain": [
 624 |        "<IPython.core.display.Markdown object>"
 625 |       ]
 626 |      },
 627 |      "metadata": {},
 628 |      "output_type": "display_data"
 629 |     },
 630 |     {
 631 |      "data": {
 632 |       "text/markdown": [
 633 |        "\n",
 634 |        "**Question:** How do I specify conda dependencies in my flow?\n",
 635 |        "\n",
 636 |        "**Answer:** You can specify conda dependencies in your flow using the `@conda_base` and `@conda` decorators. The `@conda_base` decorator is used at the flow level to specify explicit library dependencies, python version, and whether to exclude all steps from executing within a conda environment. The `@conda` decorator is used at the step level to update the explicit library dependencies, python version, and conda environment exclusion as specified by the `@conda_base` decorator. By using these decorators, you can define the conda environment for each step in your flow. Additionally, you can add an explicit dependency on a specific module by using the `@conda` decorator in the desired step.\n"
 637 |       ],
 638 |       "text/plain": [
 639 |        "<IPython.core.display.Markdown object>"
 640 |       ]
 641 |      },
 642 |      "metadata": {},
 643 |      "output_type": "display_data"
 644 |     }
 645 |    ],
 646 |    "source": [
 647 |     "# same questions as above. now we can answer it with the new index.\n",
 648 |     "question = \"How does Metaflow use `tmpfs`?\"\n",
 649 |     "dmqa(question, qa_iter(question, index).response)\n",
 650 |     "\n",
 651 |     "# and this one too.\n",
 652 |     "question = \"How do I specify conda dependencies in my flow?\"\n",
 653 |     "dmqa(question, qa_iter(question, index).response)"
 654 |    ]
 655 |   },
 656 |   {
 657 |    "cell_type": "markdown",
 658 |    "metadata": {},
 659 |    "source": [
 660 |     "# 3. Constructing an index based on sections of endorsed content we can link to\n",
 661 |     "[Parse the Documents into Nodes](https://gpt-index.readthedocs.io/en/latest/end_to_end_tutorials/usage_pattern.html#parse-the-documents-into-nodes)"
 662 |    ]
 663 |   },
 664 |   {
 665 |    "cell_type": "markdown",
 666 |    "metadata": {},
 667 |    "source": [
 668 |     "## Fetch all file paths of .md files"
 669 |    ]
 670 |   },
 671 |   {
 672 |    "cell_type": "code",
 673 |    "execution_count": 83,
 674 |    "metadata": {},
 675 |    "outputs": [
 676 |     {
 677 |      "name": "stdout",
 678 |      "output_type": "stream",
 679 |      "text": [
 680 |       "Looking for remote repository at https://github.com/Netflix/metaflow-docs\n",
 681 |       "Looking for remote repository at https://github.com/huggingface/accelerate\n"
 682 |      ]
 683 |     }
 684 |    ],
 685 |    "source": [
 686 |     "from rag_tools.filetypes.markdown import Mixin as mm\n",
 687 |     "# this cell is like a condensed version of `/flows/markdown_chunker.py`\n",
 688 |     "_mm = mm()\n",
 689 |     "_mm.repo_params = REPO_PARAMS\n",
 690 |     "\n",
 691 |     "# this is an unprocessed df, so you may want to clean it as /flows/data_table_processor.py does.\n",
 692 |     "df = _mm.load_df_from_repo_list()"
 693 |    ]
 694 |   },
 695 |   {
 696 |    "cell_type": "code",
 697 |    "execution_count": 84,
 698 |    "metadata": {},
 699 |    "outputs": [],
 700 |    "source": [
 701 |     "documents, ids = get_documents_from_content_section_df(df)\n",
 702 |     "df['doc_ids'] = ids"
 703 |    ]
 704 |   },
 705 |   {
 706 |    "cell_type": "code",
 707 |    "execution_count": 85,
 708 |    "metadata": {},
 709 |    "outputs": [],
 710 |    "source": [
 711 |     "index = VectorStoreIndex.from_documents(documents)"
 712 |    ]
 713 |   },
 714 |   {
 715 |    "cell_type": "code",
 716 |    "execution_count": 100,
 717 |    "metadata": {},
 718 |    "outputs": [
 719 |     {
 720 |      "data": {
 721 |       "text/markdown": [
 722 |        "The above instructions work even if you use [`@conda`\n",
 723 |        " decorators](/scaling/dependencies#managing-dependencies-with-conda-decorator) in your\n",
 724 |        " code; you need, however, to ensure that the `conda` binary is available in your `PATH`.\n",
 725 |        " The easiest way to do this is to set the `PATH` environment variable to properly include\n",
 726 |        " the path to the `conda` binary if it is in a non-standard location. In VSCode, you can\n",
 727 |        " simply add this value in the env section of launch.json and in PyCharm, the UI allows\n",
 728 |        " you to set environment variables."
 729 |       ],
 730 |       "text/plain": [
 731 |        "<IPython.core.display.Markdown object>"
 732 |       ]
 733 |      },
 734 |      "metadata": {},
 735 |      "output_type": "display_data"
 736 |     }
 737 |    ],
 738 |    "source": [
 739 |     "import random\n",
 740 |     "dm(random.choice(documents).text)"
 741 |    ]
 742 |   },
 743 |   {
 744 |    "cell_type": "markdown",
 745 |    "metadata": {},
 746 |    "source": [
 747 |     "# 4. Use Cases"
 748 |    ]
 749 |   },
 750 |   {
 751 |    "cell_type": "markdown",
 752 |    "metadata": {},
 753 |    "source": [
 754 |     "## Load a dataframe of text chunks and metadata from your latest workflow runs"
 755 |    ]
 756 |   },
 757 |   {
 758 |    "cell_type": "code",
 759 |    "execution_count": 6,
 760 |    "metadata": {},
 761 |    "outputs": [
 762 |     {
 763 |      "name": "stdout",
 764 |      "output_type": "stream",
 765 |      "text": [
 766 |       "1692847583518101\n"
 767 |      ]
 768 |     }
 769 |    ],
 770 |    "source": [
 771 |     "# find latest Metaflow run that saved processed df\n",
 772 |     "run = None\n",
 773 |     "for _run in Flow('DataTableProcessor'):\n",
 774 |     "    if _run.data.save_processed_df:\n",
 775 |     "        run = _run\n",
 776 |     "        break\n",
 777 |     "\n",
 778 |     "print(run.id)"
 779 |    ]
 780 |   },
 781 |   {
 782 |    "cell_type": "code",
 783 |    "execution_count": 7,
 784 |    "metadata": {},
 785 |    "outputs": [],
 786 |    "source": [
 787 |     "df = run.data.processed_df\n",
 788 |     "documents, ids = get_documents_from_content_section_df(df)\n",
 789 |     "df['doc_id'] = ids\n",
 790 |     "index = VectorStoreIndex(documents)"
 791 |    ]
 792 |   },
 793 |   {
 794 |    "cell_type": "markdown",
 795 |    "metadata": {},
 796 |    "source": [
 797 |     "## Q&A"
 798 |    ]
 799 |   },
 800 |   {
 801 |    "cell_type": "code",
 802 |    "execution_count": 12,
 803 |    "metadata": {},
 804 |    "outputs": [
 805 |     {
 806 |      "name": "stdout",
 807 |      "output_type": "stream",
 808 |      "text": [
 809 |       "['1', '1209']\n"
 810 |      ]
 811 |     },
 812 |     {
 813 |      "data": {
 814 |       "text/markdown": [
 815 |        "#### What is Metaflow?"
 816 |       ],
 817 |       "text/plain": [
 818 |        "<IPython.core.display.Markdown object>"
 819 |       ]
 820 |      },
 821 |      "metadata": {},
 822 |      "output_type": "display_data"
 823 |     },
 824 |     {
 825 |      "data": {
 826 |       "text/markdown": [
 827 |        "**Retrieved Response**"
 828 |       ],
 829 |       "text/plain": [
 830 |        "<IPython.core.display.Markdown object>"
 831 |       ]
 832 |      },
 833 |      "metadata": {},
 834 |      "output_type": "display_data"
 835 |     },
 836 |     {
 837 |      "data": {
 838 |       "text/markdown": [
 839 |        "Metaflow is a Python library that simplifies the development, deployment, and operation of data-intensive applications, particularly those related to data science and machine learning. It was initially created at Netflix to enhance the productivity of data scientists working on various projects. Metaflow is available as an open-source framework under the Apache License, Version 2.0. It allows data scientists to focus on important aspects like feature engineering and model development while abstracting away tasks such as job organization, orchestration, scheduling, and interaction with data warehouses. Additionally, Metaflow enables the building of production-ready machine learning workflows using a simple Python API and facilitates seamless transitions between local prototyping environments and cloud-based deployments."
 840 |       ],
 841 |       "text/plain": [
 842 |        "<IPython.core.display.Markdown object>"
 843 |       ]
 844 |      },
 845 |      "metadata": {},
 846 |      "output_type": "display_data"
 847 |     },
 848 |     {
 849 |      "data": {
 850 |       "text/markdown": [
 851 |        "#### Sources"
 852 |       ],
 853 |       "text/plain": [
 854 |        "<IPython.core.display.Markdown object>"
 855 |       ]
 856 |      },
 857 |      "metadata": {},
 858 |      "output_type": "display_data"
 859 |     },
 860 |     {
 861 |      "data": {
 862 |       "text/markdown": [
 863 |        "##### [What is Metaflow](https://docs.metaflow.org/introduction/what-is-metaflow#what-is-metaflow)"
 864 |       ],
 865 |       "text/plain": [
 866 |        "<IPython.core.display.Markdown object>"
 867 |       ]
 868 |      },
 869 |      "metadata": {},
 870 |      "output_type": "display_data"
 871 |     },
 872 |     {
 873 |      "data": {
 874 |       "text/markdown": [
 875 |        "Metaflow is a human-friendly Python library that makes it straightforward to develop, deploy, and op..."
 876 |       ],
 877 |       "text/plain": [
 878 |        "<IPython.core.display.Markdown object>"
 879 |       ]
 880 |      },
 881 |      "metadata": {},
 882 |      "output_type": "display_data"
 883 |     },
 884 |     {
 885 |      "data": {
 886 |       "text/markdown": [
 887 |        "##### [Reproducible ML pipelines with Metaflow](https://outerbounds.com/blog/machine-learning-pipelines-from-prototype-to-production#reproducible-ml-pipelines-with-metaflow)"
 888 |       ],
 889 |       "text/plain": [
 890 |        "<IPython.core.display.Markdown object>"
 891 |       ]
 892 |      },
 893 |      "metadata": {},
 894 |      "output_type": "display_data"
 895 |     },
 896 |     {
 897 |      "data": {
 898 |       "text/markdown": [
 899 |        "To help data scientists focus on the parts of the stack they really care about, such as feature engi..."
 900 |       ],
 901 |       "text/plain": [
 902 |        "<IPython.core.display.Markdown object>"
 903 |       ]
 904 |      },
 905 |      "metadata": {},
 906 |      "output_type": "display_data"
 907 |     }
 908 |    ],
 909 |    "source": [
 910 |     "question = \"What is Metaflow?\"\n",
 911 |     "generative_search_engine_iter(question, index, df)"
 912 |    ]
 913 |   },
 914 |   {
 915 |    "cell_type": "code",
 916 |    "execution_count": 108,
 917 |    "metadata": {},
 918 |    "outputs": [
 919 |     {
 920 |      "data": {
 921 |       "text/markdown": [
 922 |        "#### How does Metaflow work with Kubernetes?"
 923 |       ],
 924 |       "text/plain": [
 925 |        "<IPython.core.display.Markdown object>"
 926 |       ]
 927 |      },
 928 |      "metadata": {},
 929 |      "output_type": "display_data"
 930 |     },
 931 |     {
 932 |      "data": {
 933 |       "text/markdown": [
 934 |        "**Retrieved Response**"
 935 |       ],
 936 |       "text/plain": [
 937 |        "<IPython.core.display.Markdown object>"
 938 |       ]
 939 |      },
 940 |      "metadata": {},
 941 |      "output_type": "display_data"
 942 |     },
 943 |     {
 944 |      "data": {
 945 |       "text/markdown": [
 946 |        "Metaflow now has the capability to run on top of any Kubernetes cluster. Users can run all or parts of any Metaflow flow on Kubernetes from their workstation by using the command \"run --with kubernetes\". Additionally, users can deploy their flow to Argo Workflows, a Kubernetes-native workflow scheduler, with a single command \"argo-workflows create\" to execute the flow asynchronously. For more information on setting up and operating Kubernetes for Metaflow, users can refer to the engineering resources provided by Metaflow."
 947 |       ],
 948 |       "text/plain": [
 949 |        "<IPython.core.display.Markdown object>"
 950 |       ]
 951 |      },
 952 |      "metadata": {},
 953 |      "output_type": "display_data"
 954 |     },
 955 |     {
 956 |      "data": {
 957 |       "text/markdown": [
 958 |        "#### Sources"
 959 |       ],
 960 |       "text/plain": [
 961 |        "<IPython.core.display.Markdown object>"
 962 |       ]
 963 |      },
 964 |      "metadata": {},
 965 |      "output_type": "display_data"
 966 |     },
 967 |     {
 968 |      "data": {
 969 |       "text/markdown": [
 970 |        "##### [Add capability to launch Metaflow tasks on Kubernetes and schedule Metaflow flows with Argo Workflows.](https://docs.metaflow.org/internals/release-notes#add-capability-to-launch-metaflow-tasks-on-kubernetes-and-schedule-metaflow-flows-with-argo-workflows)"
 971 |       ],
 972 |       "text/plain": [
 973 |        "<IPython.core.display.Markdown object>"
 974 |       ]
 975 |      },
 976 |      "metadata": {},
 977 |      "output_type": "display_data"
 978 |     },
 979 |     {
 980 |      "data": {
 981 |       "text/markdown": [
 982 |        "This release enables brand new capabilities for [Metaflow on top of\n",
 983 |        " Kubernetes](https://outerbounds..."
 984 |       ],
 985 |       "text/plain": [
 986 |        "<IPython.core.display.Markdown object>"
 987 |       ]
 988 |      },
 989 |      "metadata": {},
 990 |      "output_type": "display_data"
 991 |     },
 992 |     {
 993 |      "data": {
 994 |       "text/markdown": [
 995 |        "##### [Using Kubernetes](https://docs.metaflow.org/scaling/remote-tasks/kubernetes#using-kubernetes)"
 996 |       ],
 997 |       "text/plain": [
 998 |        "<IPython.core.display.Markdown object>"
 999 |       ]
1000 |      },
1001 |      "metadata": {},
1002 |      "output_type": "display_data"
1003 |     },
1004 |     {
1005 |      "data": {
1006 |       "text/markdown": [
1007 |        "Here are some useful tips and tricks related to running Metaflow on Kubernetes. See our\n",
1008 |        " engineering..."
1009 |       ],
1010 |       "text/plain": [
1011 |        "<IPython.core.display.Markdown object>"
1012 |       ]
1013 |      },
1014 |      "metadata": {},
1015 |      "output_type": "display_data"
1016 |     }
1017 |    ],
1018 |    "source": [
1019 |     "question = \"How does Metaflow work with Kubernetes?\"\n",
1020 |     "generative_search_engine_iter(question, index, df)"
1021 |    ]
1022 |   },
1023 |   {
1024 |    "cell_type": "code",
1025 |    "execution_count": 109,
1026 |    "metadata": {},
1027 |    "outputs": [
1028 |     {
1029 |      "data": {
1030 |       "text/markdown": [
1031 |        "#### What is a DAG?"
1032 |       ],
1033 |       "text/plain": [
1034 |        "<IPython.core.display.Markdown object>"
1035 |       ]
1036 |      },
1037 |      "metadata": {},
1038 |      "output_type": "display_data"
1039 |     },
1040 |     {
1041 |      "data": {
1042 |       "text/markdown": [
1043 |        "**Retrieved Response**"
1044 |       ],
1045 |       "text/plain": [
1046 |        "<IPython.core.display.Markdown object>"
1047 |       ]
1048 |      },
1049 |      "metadata": {},
1050 |      "output_type": "display_data"
1051 |     },
1052 |     {
1053 |      "data": {
1054 |       "text/markdown": [
1055 |        "A DAG, or directed acyclic graph, is a graph that consists of nodes connected by directed edges, where the edges have a specific direction and there are no cycles in the graph. In the context of Metaflow, a DAG is inferred based on the transitions between step functions. The nodes in the DAG are the steps, which represent operations, and the edges represent the transitions between steps. The DAG structure is important for defining the flow of execution and dependencies between steps in Metaflow."
1056 |       ],
1057 |       "text/plain": [
1058 |        "<IPython.core.display.Markdown object>"
1059 |       ]
1060 |      },
1061 |      "metadata": {},
1062 |      "output_type": "display_data"
1063 |     },
1064 |     {
1065 |      "data": {
1066 |       "text/markdown": [
1067 |        "#### Sources"
1068 |       ],
1069 |       "text/plain": [
1070 |        "<IPython.core.display.Markdown object>"
1071 |       ]
1072 |      },
1073 |      "metadata": {},
1074 |      "output_type": "display_data"
1075 |     },
1076 |     {
1077 |      "data": {
1078 |       "text/markdown": [
1079 |        "##### [Graph](https://docs.metaflow.org/internals/technical-overview#graph)"
1080 |       ],
1081 |       "text/plain": [
1082 |        "<IPython.core.display.Markdown object>"
1083 |       ]
1084 |      },
1085 |      "metadata": {},
1086 |      "output_type": "display_data"
1087 |     },
1088 |     {
1089 |      "data": {
1090 |       "text/markdown": [
1091 |        "Metaflow infers a directed (typically acyclic) graph based on the transitions between\n",
1092 |        " step function..."
1093 |       ],
1094 |       "text/plain": [
1095 |        "<IPython.core.display.Markdown object>"
1096 |       ]
1097 |      },
1098 |      "metadata": {},
1099 |      "output_type": "display_data"
1100 |     },
1101 |     {
1102 |      "data": {
1103 |       "text/markdown": [
1104 |        "##### [The Structure of Metaflow Code](https://docs.metaflow.org/metaflow/basics#the-structure-of-metaflow-code)"
1105 |       ],
1106 |       "text/plain": [
1107 |        "<IPython.core.display.Markdown object>"
1108 |       ]
1109 |      },
1110 |      "metadata": {},
1111 |      "output_type": "display_data"
1112 |     },
1113 |     {
1114 |      "data": {
1115 |       "text/markdown": [
1116 |        "Metaflow follows [the dataflow\n",
1117 |        " paradigm](https://en.wikipedia.org/wiki/Dataflow_programming) which ..."
1118 |       ],
1119 |       "text/plain": [
1120 |        "<IPython.core.display.Markdown object>"
1121 |       ]
1122 |      },
1123 |      "metadata": {},
1124 |      "output_type": "display_data"
1125 |     }
1126 |    ],
1127 |    "source": [
1128 |     "question = \"What is a DAG?\"\n",
1129 |     "generative_search_engine_iter(question, index, df)"
1130 |    ]
1131 |   },
1132 |   {
1133 |    "cell_type": "markdown",
1134 |    "metadata": {},
1135 |    "source": [
1136 |     "## RAG for a generic sales pitch "
1137 |    ]
1138 |   },
1139 |   {
1140 |    "cell_type": "markdown",
1141 |    "metadata": {},
1142 |    "source": [
1143 |     "### A custom prompt template in pure Python\n",
1144 |     "Many tools exist to make prompts easy to manage. [Langchain](https://www.langchain.com/) is an emerging leader in this space."
1145 |    ]
1146 |   },
1147 |   {
1148 |    "cell_type": "code",
1149 |    "execution_count": 126,
1150 |    "metadata": {},
1151 |    "outputs": [],
1152 |    "source": [
1153 |     "prompt_template = \"\"\"\n",
1154 |     "Write an introduction email to a potential technical user who works as a {role} at {company}.\n",
1155 |     "\n",
1156 |     "Do not directly address the user's role or company anywhere in the email.\n",
1157 |     "\n",
1158 |     "Write the email for a technical audience who doesn't want to read marketing copy.\n",
1159 |     "\n",
1160 |     "Highlight Metaflow features related to their interests including {interests}.\n",
1161 |     "\n",
1162 |     "Include a summary motivating the benefits of these features by summarizing this context about Metaflow:\n",
1163 |     "{context_about_interests}\n",
1164 |     "\n",
1165 |     "Make a subtle reference that Outerbounds platform can help them with {enterprise_platform_interest_hook}.\n",
1166 |     "\n",
1167 |     "Include a summary motivating the benefits of Outerbounds platform by summarizing this context about Outerbounds platform:\n",
1168 |     "{context_about_enterprise_platform_interest_hook}\n",
1169 |     "\n",
1170 |     "Make the CTA to schedule a meeting to discuss how Outerbounds platform can help them.\n",
1171 |     "\n",
1172 |     "Make the email as short as possible. \n",
1173 |     "\n",
1174 |     "Do not reference your own profession or any experiences. Do not talk about yourself.\n",
1175 |     "\n",
1176 |     "Do not explicitly reference the company that the receiver works for. Only implicitly use this knowledge to demonstrate knowledge of the problems their organization may face.\n",
1177 |     "\n",
1178 |     "Avoid speaking from the first person.\n",
1179 |     "\n",
1180 |     "Avoid directly saying that you know about anyone's past experience or background. \n",
1181 |     "\n",
1182 |     "Avoid saying anything with similar sentiment to these statements:\n",
1183 |     "    <br/> &ensp;- Author Metaflow flows using notebooks\n",
1184 |     "    <br/> &ensp;- Refer to an interest listed above as something Metaflow works with\n",
1185 |     "\"\"\""
1186 |    ]
1187 |   },
1188 |   {
1189 |    "cell_type": "markdown",
1190 |    "metadata": {},
1191 |    "source": [
1192 |     "### Prompt engineering 101\n",
1193 |     "Inject some relevant context into our prompt template."
1194 |    ]
1195 |   },
1196 |   {
1197 |    "cell_type": "code",
1198 |    "execution_count": 127,
1199 |    "metadata": {},
1200 |    "outputs": [],
1201 |    "source": [
1202 |     "things_we_learned_about_a_prospect = dict(\n",
1203 |     "    role=\"data scientist\",\n",
1204 |     "    company=\"Big Industries Co.\",\n",
1205 |     "\n",
1206 |     "    # comma-separated lists\n",
1207 |     "    interests=\"mlops, deep learning, kubernetes\",\n",
1208 |     "    enterprise_platform_interest_hook=\"CI/CD, security\",\n",
1209 |     ")"
1210 |    ]
1211 |   },
1212 |   {
1213 |    "cell_type": "markdown",
1214 |    "metadata": {},
1215 |    "source": [
1216 |     "### RAGify! \n",
1217 |     "Here we use the Q&A iteration you saw in previous sections, but instead of printing the results we are using them to \"augment\" the prompt."
1218 |    ]
1219 |   },
1220 |   {
1221 |    "cell_type": "code",
1222 |    "execution_count": 128,
1223 |    "metadata": {},
1224 |    "outputs": [
1225 |     {
1226 |      "name": "stderr",
1227 |      "output_type": "stream",
1228 |      "text": [
1229 |       "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:27<00:00,  9.04s/it]\n",
1230 |       "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.34s/it]\n"
1231 |      ]
1232 |     }
1233 |    ],
1234 |    "source": [
1235 |     "# RAG step - fetch some context\n",
1236 |     "from tqdm import tqdm\n",
1237 |     "\n",
1238 |     "interest_list = things_we_learned_about_a_prospect[\"interests\"].split(\", \")\n",
1239 |     "context_about_interests = []\n",
1240 |     "for interest in tqdm(interest_list):\n",
1241 |     "    context_about_interest = qa_iter(\n",
1242 |     "        f\"Describe how Metaflow and {interest} can be used together in ML workflows. Focus on Metaflow being used as a complimentary tool.\",\n",
1243 |     "        index,\n",
1244 |     "    )\n",
1245 |     "    context_about_interests.append(context_about_interest)\n",
1246 |     "\n",
1247 |     "enterprise_platform_interest_hook = things_we_learned_about_a_prospect[\n",
1248 |     "    \"enterprise_platform_interest_hook\"\n",
1249 |     "].split(\", \")\n",
1250 |     "context_about_enterprise_platform_interest_hook = []\n",
1251 |     "for interest in tqdm(enterprise_platform_interest_hook):\n",
1252 |     "    context_about_interest = qa_iter(\n",
1253 |     "        f\"Describe how Metaflow and {interest} can be used together in ML workflows. Focus on Metaflow being used as a complimentary tool.\",\n",
1254 |     "        index,\n",
1255 |     "    )\n",
1256 |     "    context_about_enterprise_platform_interest_hook.append(context_about_interest)"
1257 |    ]
1258 |   },
1259 |   {
1260 |    "cell_type": "code",
1261 |    "execution_count": 129,
1262 |    "metadata": {},
1263 |    "outputs": [],
1264 |    "source": [
1265 |     "# unpack context injection data to encourage the generative LLM to embed the links in its Markdown response.\n",
1266 |     "\n",
1267 |     "def prepare_with_links(context_list: List[Context]):\n",
1268 |     "    def strip_numbers_and_punctuation(header):\n",
1269 |     "        import string\n",
1270 |     "        import re\n",
1271 |     "\n",
1272 |     "        return re.sub(\n",
1273 |     "            r\"\\d\", \"\", header.translate(str.maketrans(\"\", \"\", string.punctuation))\n",
1274 |     "        )\n",
1275 |     "\n",
1276 |     "    context_to_inject = \"\"\"\n",
1277 |     "The results will be given to you in a format like\n",
1278 |     "<br/> &ensp;- \"Text\": [Comma-separated list of [Link Label](Link URL)]\n",
1279 |     "Use the [Link Label](Link URL) syntax in the summary, and use the links in context of the paragraph.\n",
1280 |     "\n",
1281 |     "These are the results to summarize:\n",
1282 |     "\"\"\"\n",
1283 |     "    for _context in context_list:\n",
1284 |     "        similar_chunk_df = _context.get_link_df(df)\n",
1285 |     "        links = []\n",
1286 |     "        for header, url in list(\n",
1287 |     "            zip(similar_chunk_df.header.values, similar_chunk_df.page_url.values)\n",
1288 |     "        ):\n",
1289 |     "            links.append(f\"[{strip_numbers_and_punctuation(header)}]({url})\")\n",
1290 |     "        context_to_inject += (\n",
1291 |     "            \"<br/>\" + \" &ensp;-\" + ' \"' + _context.response + '\": ' + \", \".join(links)\n",
1292 |     "        )\n",
1293 |     "    return context_to_inject"
1294 |    ]
1295 |   },
1296 |   {
1297 |    "cell_type": "code",
1298 |    "execution_count": 130,
1299 |    "metadata": {},
1300 |    "outputs": [],
1301 |    "source": [
1302 |     "prompt = prompt_template.format(\n",
1303 |     "    **things_we_learned_about_a_prospect,\n",
1304 |     "    context_about_interests=prepare_with_links(context_about_interests),\n",
1305 |     "    context_about_enterprise_platform_interest_hook=prepare_with_links(\n",
1306 |     "        context_about_enterprise_platform_interest_hook\n",
1307 |     "    )\n",
1308 |     ")"
1309 |    ]
1310 |   },
1311 |   {
1312 |    "cell_type": "code",
1313 |    "execution_count": 131,
1314 |    "metadata": {},
1315 |    "outputs": [
1316 |     {
1317 |      "name": "stdout",
1318 |      "output_type": "stream",
1319 |      "text": [
1320 |       "Enter a negative sentiment to avoid by append to the prompt with an 'avoid' instruction, or press enter to continue, or type r then enter to restart:  \n"
1321 |      ]
1322 |     }
1323 |    ],
1324 |    "source": [
1325 |     "# append negative sentiment escapes to prompt.\n",
1326 |     "# treat these like UX escape hatches, where you can always just stuff extra things in, until you reach the model's context width.\n",
1327 |     "user_interaction = True\n",
1328 |     "negative_sentiment_context_lines = []\n",
1329 |     "while user_interaction:\n",
1330 |     "\n",
1331 |     "    if len(negative_sentiment_context_lines) == 0:\n",
1332 |     "        user_input = input(\n",
1333 |     "            \"Enter a negative sentiment to avoid by append to the prompt with an 'avoid' instruction, or press enter to continue, or type r then enter to restart: \"\n",
1334 |     "        )\n",
1335 |     "    else:\n",
1336 |     "        print(\"Current list of negative sentiment statements:\")\n",
1337 |     "        for line in negative_sentiment_context_lines:\n",
1338 |     "            print(line)\n",
1339 |     "        user_input = input(\n",
1340 |     "            \"Single enter to continue, double enter to complete list of results of negative sentiment statements.\"\n",
1341 |     "        )\n",
1342 |     "    # process user input\n",
1343 |     "    if user_input == \"\":\n",
1344 |     "        user_interaction = False\n",
1345 |     "    elif user_input == \"r\":\n",
1346 |     "        negative_sentiment_context_lines = []\n",
1347 |     "    else:\n",
1348 |     "        negative_sentiment_context_lines.append(user_input)\n",
1349 |     "\n",
1350 |     "for line in negative_sentiment_context_lines:\n",
1351 |     "    prompt += \"<br/>\" + \" &ensp;-\" + \" \" + line"
1352 |    ]
1353 |   },
1354 |   {
1355 |    "cell_type": "markdown",
1356 |    "metadata": {},
1357 |    "source": [
1358 |     "### View the final prompt the model will see"
1359 |    ]
1360 |   },
1361 |   {
1362 |    "cell_type": "code",
1363 |    "execution_count": 132,
1364 |    "metadata": {},
1365 |    "outputs": [
1366 |     {
1367 |      "data": {
1368 |       "text/markdown": [
1369 |        "##### The first five hundred chars"
1370 |       ],
1371 |       "text/plain": [
1372 |        "<IPython.core.display.Markdown object>"
1373 |       ]
1374 |      },
1375 |      "metadata": {},
1376 |      "output_type": "display_data"
1377 |     },
1378 |     {
1379 |      "data": {
1380 |       "text/markdown": [
1381 |        "\n",
1382 |        "Write an introduction email to a potential technical user who works as a data scientist at Big Industries Co..\n",
1383 |        "\n",
1384 |        "Do not directly address the user's role or company anywhere in the email.\n",
1385 |        "\n",
1386 |        "Write the email for a technical audience who doesn't want to read marketing copy.\n",
1387 |        "\n",
1388 |        "Highlight Metaflow features related to their interests including mlops, deep learning, kubernetes.\n",
1389 |        "\n",
1390 |        "Include a summary motivating the benefits of these features by summarizing this context about Metaflow:\n",
1391 |        "\n",
1392 |        "The results will be give..."
1393 |       ],
1394 |       "text/plain": [
1395 |        "<IPython.core.display.Markdown object>"
1396 |       ]
1397 |      },
1398 |      "metadata": {},
1399 |      "output_type": "display_data"
1400 |     }
1401 |    ],
1402 |    "source": [
1403 |     "dm(\"##### The first five hundred chars\") \n",
1404 |     "dm(prompt[:500] + '...')"
1405 |    ]
1406 |   },
1407 |   {
1408 |    "cell_type": "markdown",
1409 |    "metadata": {},
1410 |    "source": [
1411 |     "### See your RAG app in action! "
1412 |    ]
1413 |   },
1414 |   {
1415 |    "cell_type": "code",
1416 |    "execution_count": 133,
1417 |    "metadata": {},
1418 |    "outputs": [
1419 |     {
1420 |      "data": {
1421 |       "text/markdown": [
1422 |        "##### Generated email"
1423 |       ],
1424 |       "text/plain": [
1425 |        "<IPython.core.display.Markdown object>"
1426 |       ]
1427 |      },
1428 |      "metadata": {},
1429 |      "output_type": "display_data"
1430 |     },
1431 |     {
1432 |      "data": {
1433 |       "text/markdown": [
1434 |        "Subject: Enhancing ML Workflows with Metaflow and Outerbounds Platform\n",
1435 |        "\n",
1436 |        "Dear [Recipient],\n",
1437 |        "\n",
1438 |        "I hope this email finds you well. I wanted to reach out to introduce you to Metaflow, a powerful tool that can enhance your data science workflows and help you achieve scalability, reproducibility, and production-readiness.\n",
1439 |        "\n",
1440 |        "Metaflow can be used as a complimentary tool in ML workflows, particularly when combined with MLOps practices. By integrating Metaflow into your pipeline, you can leverage its capabilities to build and deploy ML models more efficiently. It takes care of low-level infrastructure such as data, compute, orchestration, and versioning, allowing you to focus on the fun parts of building applications and models. [Metaflow takes care of the plumbing so you can focus on the fun parts](https://docs.metaflow.org/introduction/why-metaflow#10-metaflow-takes-care-of-the-plumbing-so-you-can-focus-on-the-fun-parts).\n",
1441 |        "\n",
1442 |        "For deep learning projects, Metaflow provides a robust and user-friendly foundation. It covers the full stack of DS/ML infrastructure, allowing you to focus on iterating on ideas quickly and deploying them confidently. [Metaflow covers the full stack of DS/ML infrastructure](https://docs.metaflow.org/introduction/why-metaflow#9-metaflow-covers-the-full-stack-of-ds-ml-infrastructure).\n",
1443 |        "\n",
1444 |        "If you're working with Kubernetes, Metaflow seamlessly integrates with it to leverage scalable infrastructure for running ML/DS applications. This makes it suitable for both small and large organizations. [Metaflow relies on systems that engineers know and trust](https://docs.metaflow.org/introduction/why-metaflow#11-metaflow-relies-on-systems-that-engineers-know-and-trust).\n",
1445 |        "\n",
1446 |        "In addition to Metaflow, I wanted to mention Outerbounds Platform, which can further enhance your ML workflows. It offers CI/CD capabilities, ensuring that changes to ML models and data pipelines are thoroughly tested and deployed in a reliable and efficient manner. Outerbounds Platform also prioritizes security, respecting your company's security policies and providing a secure environment for executing data science projects.\n",
1447 |        "\n",
1448 |        "I would love to schedule a meeting to discuss how Metaflow and Outerbounds Platform can specifically benefit your organization and address any challenges you may be facing. Please let me know a time that works for you, and I will be happy to set up a call.\n",
1449 |        "\n",
1450 |        "Looking forward to hearing from you.\n",
1451 |        "\n",
1452 |        "Best regards,\n",
1453 |        "[Your Name]"
1454 |       ],
1455 |       "text/plain": [
1456 |        "<IPython.core.display.Markdown object>"
1457 |       ]
1458 |      },
1459 |      "metadata": {},
1460 |      "output_type": "display_data"
1461 |     }
1462 |    ],
1463 |    "source": [
1464 |     "query_engine = index.as_query_engine()\n",
1465 |     "query_res = query_engine.query(prompt)\n",
1466 |     "response = query_res.response\n",
1467 |     "dm(\"##### Generated email\")\n",
1468 |     "dm(response)"
1469 |    ]
1470 |   },
1471 |   {
1472 |    "cell_type": "markdown",
1473 |    "metadata": {},
1474 |    "source": [
1475 |     "## Chatbot"
1476 |    ]
1477 |   },
1478 |   {
1479 |    "cell_type": "code",
1480 |    "execution_count": 33,
1481 |    "metadata": {},
1482 |    "outputs": [],
1483 |    "source": [
1484 |     "repos = [\n",
1485 |     "    {\n",
1486 |     "        \"deployment_url\": \"docs.metaflow.org\",\n",
1487 |     "        \"repository_path\": os.path.expanduser(\"~/Dev/metaflow-docs\"),\n",
1488 |     "        \"repository_ref\": \"master\",\n",
1489 |     "        \"base_search_path\": \"docs\",\n",
1490 |     "        \"exclude_paths\": [\"docs/v\"],\n",
1491 |     "        \"exclude_files\": [\"README.md\", \"README\"],\n",
1492 |     "    }\n",
1493 |     "]"
1494 |    ]
1495 |   },
1496 |   {
1497 |    "cell_type": "code",
1498 |    "execution_count": 34,
1499 |    "metadata": {},
1500 |    "outputs": [],
1501 |    "source": [
1502 |     "def local_repos_to_docs(repos):\n",
1503 |     "    documents = []\n",
1504 |     "    for params in repos:\n",
1505 |     "        md_files = DocumentationExtractor().filter_files(\n",
1506 |     "            params[\"repository_path\"],\n",
1507 |     "            base_search_path=params[\"base_search_path\"],\n",
1508 |     "            exclude_paths=params[\"exclude_paths\"],\n",
1509 |     "            exclude_files=params[\"exclude_files\"],\n",
1510 |     "            considered_extensions=[\".md\"]\n",
1511 |     "        )\n",
1512 |     "        md_files = [abs_path for abs_path, _ in md_files]\n",
1513 |     "        documents += get_documents_from_md_file_paths(md_files)\n",
1514 |     "    return documents"
1515 |    ]
1516 |   },
1517 |   {
1518 |    "cell_type": "code",
1519 |    "execution_count": 35,
1520 |    "metadata": {},
1521 |    "outputs": [
1522 |     {
1523 |      "data": {
1524 |       "text/plain": [
1525 |        "[Document(id_='818c7d59-7d59-4c9a-8e26-be8fa66929f6', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='168707b49cebf5424ed48615defbba683a0d7932f8451d263d491040c37aa87f', text='\\n\\nWelcome to Metaflow\\n\\nMetaflow makes it easy to build and manage real-life data science and machine learning projects.\\n\\n\\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n",
1526 |        " Document(id_='baebefe9-9cf4-4c6b-8831-3fec827af036', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='0c733db8a7e3a129b3c47977e53aa8d0ee5e0942e768f40d822a65e1da4f9aea', text='\\n\\nMotivation\\n\\n- Why Metaflow\\n- What is Metaflow\\n- Metaflow Resources\\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]"
1527 |       ]
1528 |      },
1529 |      "execution_count": 35,
1530 |      "metadata": {},
1531 |      "output_type": "execute_result"
1532 |     }
1533 |    ],
1534 |    "source": [
1535 |     "documents = local_repos_to_docs(repos)\n",
1536 |     "documents[:2]"
1537 |    ]
1538 |   },
1539 |   {
1540 |    "cell_type": "code",
1541 |    "execution_count": 36,
1542 |    "metadata": {},
1543 |    "outputs": [],
1544 |    "source": [
1545 |     "temp = 0.0\n",
1546 |     "model = \"gpt-3.5-turbo\"\n",
1547 |     "chat_mode = \"react\""
1548 |    ]
1549 |   },
1550 |   {
1551 |    "cell_type": "code",
1552 |    "execution_count": 37,
1553 |    "metadata": {},
1554 |    "outputs": [],
1555 |    "source": [
1556 |     "service_context = ServiceContext.from_defaults(llm=OpenAI(model=model, temperature=temp))\n",
1557 |     "index = VectorStoreIndex.from_documents(documents, service_context=service_context)"
1558 |    ]
1559 |   },
1560 |   {
1561 |    "cell_type": "code",
1562 |    "execution_count": 38,
1563 |    "metadata": {},
1564 |    "outputs": [],
1565 |    "source": [
1566 |     "chat_engine = index.as_chat_engine(service_context=service_context, chat_mode=chat_mode, verbose=True)"
1567 |    ]
1568 |   },
1569 |   {
1570 |    "cell_type": "code",
1571 |    "execution_count": 48,
1572 |    "metadata": {},
1573 |    "outputs": [
1574 |     {
1575 |      "name": "stdout",
1576 |      "output_type": "stream",
1577 |      "text": [
1578 |       "\u001b[38;5;200m\u001b[1;3mThought: I need to use a tool to help me answer the question.\n",
1579 |       "Action: query_engine_tool\n",
1580 |       "Action Input: {'input': 'How does Metaflow help AI developers?'}\n",
1581 |       "\u001b[0m\u001b[36;1m\u001b[1;3mObservation: Metaflow helps AI developers by providing a robust and user-friendly foundation for data-intensive applications. It takes care of the low-level infrastructure such as data, compute, orchestration, and versioning, allowing developers to focus on building their own applications, models, and policies on top of it. This means that AI developers who have a basic understanding of Python can leverage Metaflow to streamline their development process and focus on the more enjoyable aspects of their work.\n",
1582 |       "\u001b[0m"
1583 |      ]
1584 |     }
1585 |    ],
1586 |    "source": [
1587 |     "response = chat_engine.chat(\"How does Metaflow help AI developers?\")"
1588 |    ]
1589 |   },
1590 |   {
1591 |    "cell_type": "code",
1592 |    "execution_count": 34,
1593 |    "metadata": {},
1594 |    "outputs": [
1595 |     {
1596 |      "data": {
1597 |       "text/markdown": [
1598 |        "Metaflow helps AI developers by providing a robust and user-friendly foundation for data-intensive applications. It takes care of the low-level infrastructure such as data, compute, orchestration, and versioning, allowing developers to focus on building their own applications, models, and policies on top of it. This means that AI developers who have a basic understanding of Python can leverage Metaflow to streamline their development process and focus on the more enjoyable aspects of their work."
1599 |       ],
1600 |       "text/plain": [
1601 |        "<IPython.core.display.Markdown object>"
1602 |       ]
1603 |      },
1604 |      "metadata": {},
1605 |      "output_type": "display_data"
1606 |     }
1607 |    ],
1608 |    "source": [
1609 |     "dm(response.response)"
1610 |    ]
1611 |   },
1612 |   {
1613 |    "cell_type": "code",
1614 |    "execution_count": 36,
1615 |    "metadata": {},
1616 |    "outputs": [
1617 |     {
1618 |      "name": "stdout",
1619 |      "output_type": "stream",
1620 |      "text": [
1621 |       "\u001b[38;5;200m\u001b[1;3mThought: I have already answered this question. I can provide a summary of my previous response to help answer the question again.\n",
1622 |       "Action: query_engine_tool\n",
1623 |       "Action Input: {'input': \"The enjoyable aspects of AI developers' work\"}\n",
1624 |       "\u001b[0m\u001b[36;1m\u001b[1;3mObservation: The enjoyable aspects of AI developers' work include the ability to create value with machine learning, similar to traditional software engineering. The development and deployment of ML systems are expected to mature and improve over time, just as software development has done in the past 20 years. Efforts from various entities, such as governments, open source communities, and for-profit companies, are being made to articulate risks, develop best practices, and provide tooling to support ML development. This indicates a positive prognosis for the future of AI development.\n",
1625 |       "\u001b[0m\u001b[38;5;200m\u001b[1;3mResponse: The enjoyable aspects of AI developers' work include the ability to create value with machine learning, the opportunity for continuous learning and staying updated with the latest advancements, the potential for impactful applications, collaboration with interdisciplinary teams, the freedom to experiment and innovate, and the automation of repetitive tasks. Additionally, efforts from various entities are being made to support and improve the development and deployment of ML systems, indicating a positive prognosis for the future of AI development.\n",
1626 |       "\u001b[0m"
1627 |      ]
1628 |     },
1629 |     {
1630 |      "data": {
1631 |       "text/markdown": [
1632 |        "The enjoyable aspects of AI developers' work include the ability to create value with machine learning, the opportunity for continuous learning and staying updated with the latest advancements, the potential for impactful applications, collaboration with interdisciplinary teams, the freedom to experiment and innovate, and the automation of repetitive tasks. Additionally, efforts from various entities are being made to support and improve the development and deployment of ML systems, indicating a positive prognosis for the future of AI development."
1633 |       ],
1634 |       "text/plain": [
1635 |        "<IPython.core.display.Markdown object>"
1636 |       ]
1637 |      },
1638 |      "metadata": {},
1639 |      "output_type": "display_data"
1640 |     }
1641 |    ],
1642 |    "source": [
1643 |     "response = chat_engine.chat(\"What are the enjoyable aspects of their work?\")\n",
1644 |     "dm(response.response)"
1645 |    ]
1646 |   },
1647 |   {
1648 |    "cell_type": "code",
1649 |    "execution_count": 69,
1650 |    "metadata": {},
1651 |    "outputs": [],
1652 |    "source": [
1653 |     "# chat_engine.chat_repl()"
1654 |    ]
1655 |   }
1656 |  ],
1657 |  "metadata": {
1658 |   "kernelspec": {
1659 |    "display_name": "RAG demo",
1660 |    "language": "python",
1661 |    "name": "rag-demo"
1662 |   },
1663 |   "language_info": {
1664 |    "codemirror_mode": {
1665 |     "name": "ipython",
1666 |     "version": 3
1667 |    },
1668 |    "file_extension": ".py",
1669 |    "mimetype": "text/x-python",
1670 |    "name": "python",
1671 |    "nbconvert_exporter": "python",
1672 |    "pygments_lexer": "ipython3",
1673 |    "version": "3.11.4"
1674 |   }
1675 |  },
1676 |  "nbformat": 4,
1677 |  "nbformat_minor": 4
1678 | }
1679 | 


--------------------------------------------------------------------------------
/notebooks/markdown_repo_explorer.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Extending these workflows to other repositories\n",
  8 |     "**IMPORTANT**: Please be respectful of understanding the licenses of repositories you choose to crawl with any tools, including those provided in this demonstration. This repository is intended to help you quickly get started indexing documentation in GitHub repositories, not for large-scale web scraping."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "metadata": {},
 14 |    "source": [
 15 |     "## The main tool"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 1,
 21 |    "metadata": {
 22 |     "vscode": {
 23 |      "languageId": "plaintext"
 24 |     }
 25 |    },
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "from rag_tools.repo.ops import DocumentationExtractor"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "The `DocumentationExtractor` has two functions, `filter_files` and `extract`."
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "## Finding files to chunk\n",
 43 |     "First, a set of parameters defining how to crawl a GitHub repository is passed to a custom object defined in `rag_tools.repo.ops.DocumentationExtractor`. One option is to clone a repository and pass a local path to `filter_files` to return all the paths to the `considered_extensions`"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 7,
 49 |    "metadata": {
 50 |     "vscode": {
 51 |      "languageId": "plaintext"
 52 |     }
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "import os\n",
 57 |     "METAFLOW_DOCS_REPO_PATH = os.path.expanduser(\"~/Dev/metaflow-docs\")\n",
 58 |     "file_paths = DocumentationExtractor().filter_files(\n",
 59 |     "    METAFLOW_DOCS_REPO_PATH,\n",
 60 |     "    base_search_path = \"docs\",\n",
 61 |     "    exclude_paths = [\"docs/v\"],\n",
 62 |     "    exclude_files = [\"README.md\", \"README\"],\n",
 63 |     "    considered_extensions = [\".md\"],\n",
 64 |     ")"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 9,
 70 |    "metadata": {},
 71 |    "outputs": [
 72 |     {
 73 |      "data": {
 74 |       "text/plain": [
 75 |        "(58, ('/Users/eddie/Dev/metaflow-docs/docs/index.md', 'index.md'))"
 76 |       ]
 77 |      },
 78 |      "execution_count": 9,
 79 |      "metadata": {},
 80 |      "output_type": "execute_result"
 81 |     }
 82 |    ],
 83 |    "source": [
 84 |     "len(file_paths), file_paths[0]"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "metadata": {},
 90 |    "source": [
 91 |     "Another workflow option is to only download the repository temporarily by passing the remote repositories URL to `extract`, and letting this function handle the `filter_files` call and any other parsing internally."
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 19,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "import pandas as pd\n",
101 |     "from functools import partial\n",
102 |     "from rag_tools.filetypes.markdown import parse_md_file_headers\n",
103 |     "\n",
104 |     "# we will pass this to the extract(parser=...) arg. \n",
105 |     "# you can replace this with any function that takes in filepath and filename args.\n",
106 |     "def headers_df_parser(filepath, filename, deployment_url, subdir) -> pd.DataFrame:\n",
107 |     "    return parse_md_file_headers(filepath, deployment_url=deployment_url, subdir=subdir)"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 20,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "params = {\n",
117 |     "    \"deployment_url\": \"docs.metaflow.org\",\n",
118 |     "    \"repository_path\": \"https://github.com/Netflix/metaflow-docs\",\n",
119 |     "    \"repository_ref\": \"master\",\n",
120 |     "    \"base_search_path\": \"docs\",\n",
121 |     "    \"exclude_paths\": [\"docs/v\"],\n",
122 |     "    \"exclude_files\": [\"README.md\", \"README\"],\n",
123 |     "}"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 21,
129 |    "metadata": {},
130 |    "outputs": [],
131 |    "source": [
132 |     "extractor = DocumentationExtractor(repo_url=params[\"repository_path\"])\n",
133 |     "\n",
134 |     "dfs = extractor.extract(\n",
135 |     "    base_path=params[\"base_search_path\"],\n",
136 |     "    ref=params[\"repository_ref\"],\n",
137 |     "    exclude_paths=params[\"exclude_paths\"],\n",
138 |     "    exclude_files=params[\"exclude_files\"],\n",
139 |     "    considered_extensions=[\".md\"],\n",
140 |     "    parser=partial(headers_df_parser, deployment_url=params['deployment_url'], subdir=params['base_search_path']),\n",
141 |     ")"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 24,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "df = pd.concat(dfs)"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 26,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/html": [
161 |        "<div>\n",
162 |        "<style scoped>\n",
163 |        "    .dataframe tbody tr th:only-of-type {\n",
164 |        "        vertical-align: middle;\n",
165 |        "    }\n",
166 |        "\n",
167 |        "    .dataframe tbody tr th {\n",
168 |        "        vertical-align: top;\n",
169 |        "    }\n",
170 |        "\n",
171 |        "    .dataframe thead th {\n",
172 |        "        text-align: right;\n",
173 |        "    }\n",
174 |        "</style>\n",
175 |        "<table border=\"1\" class=\"dataframe\">\n",
176 |        "  <thead>\n",
177 |        "    <tr style=\"text-align: right;\">\n",
178 |        "      <th></th>\n",
179 |        "      <th>header</th>\n",
180 |        "      <th>contents</th>\n",
181 |        "      <th>type</th>\n",
182 |        "      <th>page_url</th>\n",
183 |        "      <th>is_howto</th>\n",
184 |        "      <th>char_count</th>\n",
185 |        "      <th>word_count</th>\n",
186 |        "    </tr>\n",
187 |        "  </thead>\n",
188 |        "  <tbody>\n",
189 |        "    <tr>\n",
190 |        "      <th>10</th>\n",
191 |        "      <td>**Store and load objects to/from a known S3 lo...</td>\n",
192 |        "      <td>The above examples inferred the S3 location ba...</td>\n",
193 |        "      <td>H4</td>\n",
194 |        "      <td>https://docs.metaflow.org/scaling/data#store-a...</td>\n",
195 |        "      <td>False</td>\n",
196 |        "      <td>1367</td>\n",
197 |        "      <td>219</td>\n",
198 |        "    </tr>\n",
199 |        "    <tr>\n",
200 |        "      <th>0</th>\n",
201 |        "      <td>Deploying Variants of Event-Triggered Flows</td>\n",
202 |        "      <td>Consider this advanced scenario: You have depl...</td>\n",
203 |        "      <td>H1</td>\n",
204 |        "      <td>https://docs.metaflow.org/production/event-tri...</td>\n",
205 |        "      <td>False</td>\n",
206 |        "      <td>983</td>\n",
207 |        "      <td>119</td>\n",
208 |        "    </tr>\n",
209 |        "    <tr>\n",
210 |        "      <th>99</th>\n",
211 |        "      <td>[Fix `environment is not callable` error when ...</td>\n",
212 |        "      <td>Using `@environment` would often result in an ...</td>\n",
213 |        "      <td>H4</td>\n",
214 |        "      <td>https://docs.metaflow.org/internals/release-no...</td>\n",
215 |        "      <td>False</td>\n",
216 |        "      <td>279</td>\n",
217 |        "      <td>34</td>\n",
218 |        "    </tr>\n",
219 |        "    <tr>\n",
220 |        "      <th>71</th>\n",
221 |        "      <td>State Machine execution history logging for AW...</td>\n",
222 |        "      <td>Metaflow now logs [State Machine execution his...</td>\n",
223 |        "      <td>H4</td>\n",
224 |        "      <td>https://docs.metaflow.org/internals/release-no...</td>\n",
225 |        "      <td>False</td>\n",
226 |        "      <td>587</td>\n",
227 |        "      <td>72</td>\n",
228 |        "    </tr>\n",
229 |        "    <tr>\n",
230 |        "      <th>3</th>\n",
231 |        "      <td>Single Flow, multiple developers</td>\n",
232 |        "      <td>If `ProjectFlow` did not have a `@project deco...</td>\n",
233 |        "      <td>H3</td>\n",
234 |        "      <td>https://docs.metaflow.org/production/coordinat...</td>\n",
235 |        "      <td>False</td>\n",
236 |        "      <td>1773</td>\n",
237 |        "      <td>232</td>\n",
238 |        "    </tr>\n",
239 |        "  </tbody>\n",
240 |        "</table>\n",
241 |        "</div>"
242 |       ],
243 |       "text/plain": [
244 |        "                                               header  \\\n",
245 |        "10  **Store and load objects to/from a known S3 lo...   \n",
246 |        "0         Deploying Variants of Event-Triggered Flows   \n",
247 |        "99  [Fix `environment is not callable` error when ...   \n",
248 |        "71  State Machine execution history logging for AW...   \n",
249 |        "3                    Single Flow, multiple developers   \n",
250 |        "\n",
251 |        "                                             contents type  \\\n",
252 |        "10  The above examples inferred the S3 location ba...   H4   \n",
253 |        "0   Consider this advanced scenario: You have depl...   H1   \n",
254 |        "99  Using `@environment` would often result in an ...   H4   \n",
255 |        "71  Metaflow now logs [State Machine execution his...   H4   \n",
256 |        "3   If `ProjectFlow` did not have a `@project deco...   H3   \n",
257 |        "\n",
258 |        "                                             page_url  is_howto  char_count  \\\n",
259 |        "10  https://docs.metaflow.org/scaling/data#store-a...     False        1367   \n",
260 |        "0   https://docs.metaflow.org/production/event-tri...     False         983   \n",
261 |        "99  https://docs.metaflow.org/internals/release-no...     False         279   \n",
262 |        "71  https://docs.metaflow.org/internals/release-no...     False         587   \n",
263 |        "3   https://docs.metaflow.org/production/coordinat...     False        1773   \n",
264 |        "\n",
265 |        "    word_count  \n",
266 |        "10         219  \n",
267 |        "0          119  \n",
268 |        "99          34  \n",
269 |        "71          72  \n",
270 |        "3          232  "
271 |       ]
272 |      },
273 |      "execution_count": 26,
274 |      "metadata": {},
275 |      "output_type": "execute_result"
276 |     }
277 |    ],
278 |    "source": [
279 |     "df.sample(5)"
280 |    ]
281 |   }
282 |  ],
283 |  "metadata": {
284 |   "kernelspec": {
285 |    "display_name": "RAG demo",
286 |    "language": "python",
287 |    "name": "rag-demo"
288 |   },
289 |   "language_info": {
290 |    "codemirror_mode": {
291 |     "name": "ipython",
292 |     "version": 3
293 |    },
294 |    "file_extension": ".py",
295 |    "mimetype": "text/x-python",
296 |    "name": "python",
297 |    "nbconvert_exporter": "python",
298 |    "pygments_lexer": "ipython3",
299 |    "version": "3.11.4"
300 |   }
301 |  },
302 |  "nbformat": 4,
303 |  "nbformat_minor": 4
304 | }
305 | 


--------------------------------------------------------------------------------
/rag_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/__init__.py


--------------------------------------------------------------------------------
/rag_tools/databases/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/databases/__init__.py


--------------------------------------------------------------------------------
/rag_tools/databases/vector_database.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | 
  4 | class AbstractDB:
  5 | 
  6 |     def __init__(self):
  7 |         pass
  8 | 
  9 |     def create_index(self, **kwargs):
 10 |         pass
 11 | 
 12 |     def upsert(self, **kwargs):
 13 |         pass
 14 | 
 15 |     def vector_search(self, **kwargs):
 16 |         pass
 17 | 
 18 |     def destroy_index(self, **kwargs):
 19 |         pass
 20 | 
 21 | 
 22 | class PineconeDB(AbstractDB):
 23 | 
 24 |     def __init__(self,):
 25 |         super().__init__()
 26 |         import pinecone
 27 |         pinecone.init(
 28 |             api_key=os.environ['PINECONE_API_KEY'],
 29 |             environment=os.environ['GCP_ENVIRONMENT']
 30 |         )
 31 | 
 32 |     def create_index(self, index_name, dimension, metric='cosine'):
 33 |         import pinecone
 34 | 
 35 |         if index_name not in pinecone.list_indexes():
 36 |             pinecone.create_index(
 37 |                 name=index_name,
 38 |                 dimension=dimension,
 39 |                 metric='cosine'
 40 |             )
 41 |             # wait a moment for the index to be fully initialized
 42 |             time.sleep(1)
 43 | 
 44 |     def upsert(self, index_name, embeddings, texts, ids):
 45 |         import pinecone
 46 | 
 47 |         # now connect to the index
 48 |         index = pinecone.GRPCIndex(index_name)
 49 | 
 50 |         # upsert the vectors, but this should be done in batches not one by one.
 51 |         print("Upserting vectors", end="")
 52 |         for idx, (txt, emb) in enumerate(zip(texts, embeddings)):
 53 |             upsert_response = index.upsert(
 54 |                 vectors=[
 55 |                     {'id': f'vec{idx}',
 56 |                      'values': emb.tolist(),
 57 |                      'metadata': {'text': txt},
 58 |                      }
 59 |                 ]
 60 |             )
 61 |             print(".", end="")
 62 | 
 63 |     def vector_search(self, index_name, vector, k=1):
 64 |         import pinecone
 65 |         index = pinecone.GRPCIndex(index_name)
 66 |         xc = index.query(vector.tolist(), top_k=k, include_metadata=True)
 67 |         return xc
 68 | 
 69 |     def destroy_index(self, index_name):
 70 |         import pinecone
 71 |         pinecone.delete_index(index_name)
 72 | 
 73 | 
 74 | class LanceDB(AbstractDB):
 75 | 
 76 |     """
 77 |     LanceDB is a vector database that uses Lance to store and search vectors.
 78 |     """
 79 |     
 80 |     def __init__(self):
 81 |         super().__init__()
 82 |         self.mode = 'overwrite'
 83 |         self.dataset_path = "../../chatbot.lance"
 84 |         self.local_store = True
 85 | 
 86 |     def create_index(self, table_name, embeddings, texts, ids):
 87 | 
 88 |         import lance
 89 |         import pandas as pd
 90 |         import pyarrow as pa
 91 |         from lance.vector import vec_to_table
 92 | 
 93 |         data = pd.DataFrame({"text": texts, "id": ids})
 94 |         table = vec_to_table(embeddings)
 95 |         combined = pa.Table.from_pandas(data).append_column("vector", table["vector"])
 96 | 
 97 |         if self.local_store:
 98 |             ds = lance.write_dataset(combined, self.dataset_path, mode=self.mode)
 99 | 
100 |     def upsert(self, table_name, embeddings, texts, ids):
101 |         raise NotImplementedError("This LanceDB wrapper does not have upsert functionality beyond the create_index step yet.")
102 | 
103 |     def vector_search(self, table_name, vector, k=3):
104 |         import lance
105 |         ds = lance.dataset(self.dataset_path)
106 |         return ds.to_table(
107 |             nearest={
108 |                 "column": "vector",
109 |                 "k": k,
110 |                 "q": vector,
111 |                 "nprobes": 20,
112 |                 "refine_factor": 100
113 |             }).to_pandas()
114 | 
115 |     def destroy_index(self, table_name):
116 |         if self.local_store:
117 |             import shutil
118 |             shutil.rmtree(self.dataset_path)


--------------------------------------------------------------------------------
/rag_tools/embedders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/embedders/__init__.py


--------------------------------------------------------------------------------
/rag_tools/embedders/embedder.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | class AbstractEmbedder:
 4 |     def __init__(self, **kwargs):
 5 |        pass
 6 | 
 7 | class OpenAIEmbedder(AbstractEmbedder):
 8 | 
 9 |     def __init__(self):
10 |         super().__init__()
11 | 
12 |     def embed(self, sentences):
13 |         pass
14 | 
15 | 
16 | class SentenceTransformerEmbedder(AbstractEmbedder):
17 |     def __init__(self, model_path, device="cpu"):
18 |         super().__init__()
19 |         from sentence_transformers import SentenceTransformer
20 | 
21 |         self.model = SentenceTransformer(model_path)
22 |         self.model.to(device)
23 | 
24 |     def embed(self, sentences):
25 |         return self.model.encode(sentences, show_progress_bar=True)
26 | 
27 | 
28 | class FashionCLIPEmbedder(AbstractEmbedder):
29 |     def __init__(self):
30 |         super().__init__()
31 |         from fashion_clip.fashion_clip import FashionCLIP
32 | 
33 |         self.fclip = FashionCLIP('fashion-clip')
34 | 
35 |     def embed(self, images):
36 |         import numpy as np
37 |         image_embeddings = self.fclip.encode_images(images, batch_size=32)
38 | 
39 |         image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
40 |         return image_embeddings
41 | 
42 | 
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/rag_tools/filetypes/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/filetypes/__init__.py


--------------------------------------------------------------------------------
/rag_tools/filetypes/markdown.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import json
  3 | from functools import partial
  4 | import os
  5 | import pickle
  6 | from datetime import datetime
  7 | import frontmatter
  8 | import re
  9 | import pandas as pd
 10 | from slugify import slugify
 11 | 
 12 | 
 13 | HEADERS_REGEX = re.compile("#{1,5}")
 14 | NUMBERED_HEADER_REGEX = re.compile("")
 15 | END_ESCAPE = "< END >"
 16 | QUESTION_HEADER = "## Question"
 17 | 
 18 | 
 19 | def is_useful_line(line):
 20 |     """
 21 |     Utility function to decide if this is some autogenerated or unhelpful line from filetypes pages, which can be generated in a variety of ways.
 22 |     """
 23 |     # if line == "\n": # New lines are helpful in the sections to determine paragraph chunks. Might want to make this a generalized filter instead of hand coded heuristics.
 24 |     #     return False
 25 |     if line == "---\n":
 26 |         return False
 27 |     if line == "\n---":
 28 |         return False
 29 |     if "<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT!" in line:
 30 |         return False
 31 |     return True
 32 | 
 33 | 
 34 | def find_question(lines):
 35 |     """
 36 |     Find the line that contains a question in an Outerbounds how-to guide.
 37 |     Return it with its index.
 38 |     """
 39 | 
 40 |     # TODO: Check that this is a How to guide.
 41 | 
 42 |     # check QUESTION_HEADER is in post
 43 |     full_str = "".join(lines)
 44 |     if not QUESTION_HEADER in full_str:
 45 |         return -1e12, "Question not found."
 46 | 
 47 |     for i, line in enumerate(lines):
 48 |         if QUESTION_HEADER in line:
 49 |             return i, lines[i + 1]
 50 | 
 51 | 
 52 | def get_contents_until_next_section(lines, start_idx=0):
 53 | 
 54 |     """
 55 |     Get the contents of a section of a docusaurus markdown file.
 56 |     """
 57 | 
 58 |     _i = start_idx + 1
 59 |     N = len(lines)
 60 |     contents = []
 61 |     while _i < N and not lines[_i].startswith("#"):
 62 |         try:
 63 |             if lines[_i + 1].startswith("-"):
 64 |                 break
 65 |         except IndexError:
 66 |             pass  # end of file
 67 |         contents.append(lines[_i])
 68 |         _i += 1
 69 |     return " ".join(contents).strip()
 70 | 
 71 | 
 72 | def parse_md_file_headers(markdown_file_path, deployment_url, subdir):
 73 | 
 74 |     """
 75 |     Parse the header sections of a docusaurus markdown file.
 76 | 
 77 |     :param markdown_file_path: str - path to a markdown file.
 78 |     :param deployment_url: str - the url of the deployment used to generate links to web pages.
 79 |     :param subdir: str - the subdirectory of the repo where the markdown files should be walked for.
 80 | 
 81 |     Return a dataframe with the following columns:
 82 |         - header: str - the header text
 83 |         - contents: str - the contents of the section the header opens
 84 |         - type: str - the type of header, e.g. H1, H2, H3, H4, H5
 85 |         - page_url: str - the url of the page this header is on with the header slug appended to it
 86 | 
 87 |     """
 88 | 
 89 |     if not (
 90 |         deployment_url.startswith("https://") or deployment_url.startswith("http://")
 91 |     ):
 92 |         deployment_url = "https://" + deployment_url
 93 | 
 94 |     def _try_read_line(_i, lines):
 95 |         try:
 96 |             return lines[_i]
 97 |         except IndexError:
 98 |             return END_ESCAPE
 99 | 
100 |     page_type = get_page_type(markdown_file_path)
101 | 
102 |     def _construct_url(header):
103 |         def strip_slug(s):
104 |             return s.replace(">", "").replace("<", "").replace(".", "").replace("=", "")
105 | 
106 |         oss_release_notes_regex = "\d.\d.\d \((.*)\d{4}\)"
107 |         res = re.match(oss_release_notes_regex, header)
108 |         if res:
109 |             slug = slugify(strip_slug(res.group()))
110 |         else:
111 |             slug = slugify(strip_slug(header))
112 |         if "github" in slug:
113 |             slug = slug.split("-https-github")[0]
114 | 
115 |         with open(markdown_file_path, "r") as f:
116 |             fm_meta = frontmatter.load(f).metadata
117 |         if page_type == "blog":
118 |             url = os.path.join(deployment_url, subdir, fm_meta["slug"] + "#" + slug)
119 |         elif page_type == "docs":
120 |             try:
121 |                 prefix = fm_meta["slug"][1:]
122 |             except KeyError:
123 |                 prefix = markdown_file_path.split("docs")[-1].split(".")[0]
124 |             if deployment_url.endswith("/") or prefix.startswith("/"):
125 |                 url = deployment_url + prefix + "#" + slug
126 |             else:
127 |                 url = os.path.join(deployment_url, prefix + "#" + slug)
128 | 
129 |         return url
130 | 
131 |     def _add_one_row(headers_df, header, embeddable_chunk, type, page_url):
132 |         "This function is only intended to be used with parse_md_file_headers below."
133 |         headers_df["header"].append(header)
134 |         headers_df["contents"].append(embeddable_chunk)
135 |         headers_df["type"].append(type)
136 |         headers_df["page_url"].append(page_url)
137 | 
138 |     def _process_special_heading(x):
139 |         "Artifact of how Outerbounds customizes docusaurus markdown files."
140 |         numbering_regex = re.compile("<NumberHeading number=(.*)</NumberHeading>")
141 |         res = numbering_regex.match(x)
142 |         if res and res.group() == x:
143 |             return x.split(">")[1].split("<")[0].strip()
144 |         return x
145 | 
146 |     headers_df = {
147 |         "header": [],
148 |         "contents": [],  # This is a list of embedding chunks.
149 |         "type": [],
150 |         "page_url": [],
151 |     }
152 | 
153 |     lines = list(filter(is_useful_line, get_lines(markdown_file_path)))
154 | 
155 |     code_block_open = False
156 |     for i, line in enumerate(lines):
157 | 
158 |         if not code_block_open:
159 | 
160 |             if line.startswith("#"):
161 |                 # This is a header line.
162 |                 n_hashes = len(HEADERS_REGEX.match(line).group())
163 |                 embeddable_chunk = get_contents_until_next_section(lines, start_idx=i)
164 |                 header = _process_special_heading(line.strip().replace("#", "").strip())
165 |                 page_url = _construct_url(header)
166 |                 _add_one_row(
167 |                     headers_df,
168 |                     header,
169 |                     embeddable_chunk,
170 |                     "H{}".format(n_hashes),
171 |                     page_url,
172 |                 )
173 | 
174 |             elif line.startswith("--"):
175 | 
176 |                 # Another way header lines get specified in .md docs is above --* lines.
177 |                 # Walk up lines until we find the header for this section.
178 |                 _i = i - 1
179 |                 _prev = _try_read_line(_i, lines)
180 |                 while "\n" in _prev.strip():
181 |                     res = _try_read_line(_i, lines)
182 |                     if res != END_ESCAPE:
183 |                         _prev = res
184 |                     else:
185 |                         break
186 |                     _i -= 1
187 |                 header = _process_special_heading(
188 |                     _prev.strip().replace("#", "").strip()
189 |                 )
190 |                 embeddable_chunk = get_contents_until_next_section(lines, start_idx=i)
191 |                 page_url = _construct_url(header)
192 |                 _add_one_row(headers_df, header, embeddable_chunk, "H2", page_url)
193 | 
194 |             elif line.startswith("```"):
195 |                 code_block_open = True
196 |                 # TODO: do something wiser with these code blocks.
197 | 
198 |         else:
199 |             # check if code block is closing
200 |             if line.startswith("```"):
201 |                 code_block_open = False
202 | 
203 |     headers_df = pd.DataFrame(headers_df)
204 |     headers_df["is_howto"] = (
205 |         headers_df.header.apply(lambda s: s.lower()) == "question"
206 |     )
207 |     headers_df["char_count"] = [len(c) for c in headers_df["contents"]]
208 |     headers_df["word_count"] = [len(c.split(" ")) for c in headers_df["contents"]]
209 | 
210 |     return headers_df
211 | 
212 | 
213 | def headers_df_from_file_list(md_file_path_list, deployment_url, subdir):
214 |     """
215 |     Fetches headers from all markdown files in the md_file_path_list directory.
216 |     """
217 | 
218 |     headers_df = pd.DataFrame()
219 |     for md_file_path in md_file_path_list:
220 |         headers_df_iter = parse_md_file_headers(
221 |             md_file_path, deployment_url=deployment_url, subdir=subdir
222 |         )
223 |         headers_df = pd.concat([headers_df, headers_df_iter], axis=0)
224 | 
225 |     headers_df.index = range(len(headers_df))
226 |     headers_df = headers_df[headers_df["char_count"] > 0]
227 |     return headers_df
228 | 
229 | 
230 | def get_page_type(md_file):
231 |     if "blog" in md_file:
232 |         return "blog"
233 |     elif "docs" in md_file:
234 |         return "docs"
235 |     else:
236 |         raise ValueError(
237 |             "process_docusaurus_page only tested against docs or blog page, not {}".format(
238 |                 md_file
239 |             )
240 |         )
241 | 
242 | 
243 | def process_docusaurus_page(md_file):
244 | 
245 |     # extract front matter
246 |     with open(md_file, "r") as f:
247 |         fm_meta = frontmatter.load(f).metadata
248 | 
249 |     # fetch full document
250 |     with open(md_file, "r") as f:
251 |         unfiltered_contents = f.readlines()
252 | 
253 |     filtered_contents = list(filter(is_useful_line, unfiltered_contents))
254 |     processed_contents = list(map(lambda s: s.replace("\n", ""), filtered_contents))
255 | 
256 |     # identify type of the page
257 |     page_type = get_page_type(md_file)
258 | 
259 |     # extract top-level question for how-tos
260 |     if page_type == "docs":
261 |         cutoff_idx, question = find_question(processed_contents)
262 | 
263 |     return processed_contents, fm_meta
264 | 
265 | 
266 | def save_docusaurus_page_as_data_sample(local_in_path, root_dir=os.path.abspath(".")):
267 |     """
268 |     TODO: address magic formatting/number vibes in local_out_path = ... and 'full text': ... lines
269 |     """
270 |     assert os.path.exists(root_dir)  # TODO: throw error with useful message
271 |     local_out_path = os.path.join(
272 |         root_dir,
273 |         local_in_path.split("/")[-1].replace(
274 |             ".md", f"_{datetime.now().strftime('%Y-%m-%d')}.pkl"
275 |         ),
276 |     )
277 |     lines, fm = process_docusaurus_page(local_in_path)
278 |     result = {"full text": "\n".join(lines), "front matter": fm}
279 |     with open(local_out_path, "wb") as handle:
280 |         pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)
281 |     return local_out_path
282 | 
283 | 
284 | def load_docusaurus_page_data_sample(local_path):
285 |     with open(local_path, "rb") as handle:
286 |         return pickle.load(handle)
287 | 
288 | 
289 | def find_markdown_links(file_path):
290 |     links = []
291 |     link_pattern = r"\[([^\]]+)\]\(([^)]+)\)"  # regex pattern for [text](url)
292 | 
293 |     with open(file_path, "r", encoding="utf-8") as file:
294 |         content = file.read()
295 | 
296 |     for match in re.finditer(link_pattern, content):
297 |         text, url = match.groups()
298 |         links.append((text, url))
299 | 
300 |     return links
301 | 
302 | 
303 | def get_lines(file_path):
304 |     with open(file_path, "r", encoding="utf-8") as file:
305 |         lines = file.readlines()
306 |     return lines
307 | 
308 | 
309 | 
310 | class Mixin:
311 | 
312 |     def headers_df_parser(self, filepath, filename, deployment_url, subdir) -> pd.DataFrame:
313 |         """
314 |         Example of a custom parser. 
315 |         This one parses markdown files one section/row per header and returns a dataframe.
316 |         The section markdown is still raw in the contents column.
317 |         """
318 |         return parse_md_file_headers(filepath, deployment_url=deployment_url, subdir=subdir)
319 | 
320 | 
321 |     def load_df_from_repo_list(self) -> pd.DataFrame:
322 | 
323 |         """
324 |         Built to parse the headers of the self.repo_params
325 |         Assumes self.repo_params is set. 
326 |         See: config/repo_params.py and markdown_chunker.py for example.
327 |         """
328 | 
329 |         from rag_tools.repo.ops import ensure_repo_exists_locally, DocumentationExtractor
330 |         import pandas as pd
331 | 
332 |         headers_df = pd.DataFrame()
333 | 
334 |         for params in self.repo_params:
335 | 
336 |             if not (
337 |                 params["repository_path"].startswith("https") or \
338 |                 params["repository_path"].startswith("http") or \
339 |                 params["repository_path"].startswith("ssh") or \
340 |                 params["repository_path"].startswith("git")
341 |             ):
342 |                 print("Looking for local repository at %s" % params["repository_path"])
343 |                 extractor = DocumentationExtractor(
344 |                     local_repo_path = ensure_repo_exists_locally(
345 |                         local_repo_path = params["repository_path"],
346 |                         ref = params["repository_ref"],
347 |                     )
348 |                 )
349 | 
350 |             else:
351 |                 print("Looking for remote repository at %s" % params["repository_path"])
352 |                 extractor = DocumentationExtractor(repo_url=params["repository_path"])
353 | 
354 |             _dfs = extractor.extract(
355 |                 base_path=params["base_search_path"],
356 |                 ref=params["repository_ref"],
357 |                 exclude_paths=params["exclude_paths"],
358 |                 exclude_files=params["exclude_files"],
359 |                 considered_extensions=[".md"],
360 |                 parser=partial(self.headers_df_parser, deployment_url=params['deployment_url'], subdir=params['base_search_path']),
361 |             )
362 |             _df = pd.concat(_dfs, axis=0)
363 |             _df.index = range(len(_df))
364 | 
365 |             headers_df = pd.concat([headers_df, _df], axis=0, ignore_index=True)
366 | 
367 |         return headers_df
368 | 


--------------------------------------------------------------------------------
/rag_tools/llms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/llms/__init__.py


--------------------------------------------------------------------------------
/rag_tools/llms/llms_search.py:
--------------------------------------------------------------------------------
 1 | import openai
 2 | from transformers import AutoTokenizer
 3 | import transformers
 4 | import torch
 5 | 
 6 | class Llama2ChatWrapper:
 7 | 
 8 |     def __init__(self, model="meta-llama/Llama-2-7b-chat-hf"):
 9 |         self.model = model
10 | 
11 |         self.tokenizer = AutoTokenizer.from_pretrained(self.model)
12 |         self.pipeline = transformers.pipeline(
13 |             "text-generation",
14 |             model=self.model,
15 |             torch_dtype=torch.float16,
16 |             device_map="auto",
17 |         )
18 | 
19 |     def sample(self, text, max_tokens=200):
20 |         sequences = self.pipeline(
21 |             f'{text}\n',
22 |             do_sample=True,
23 |             top_k=10,
24 |             num_return_sequences=1,
25 |             eos_token_id=self.tokenizer.eos_token_id,
26 |             max_length=max_tokens,
27 |         )
28 | 
29 |         # return the first sequence
30 |         for seq in sequences:
31 |             return seq['generated_text']
32 | 
33 | 
34 | class ChatGPTWrapper:
35 |     def __init__(self, api_key, model_name="gpt-3.5-turbo"):
36 |         """
37 |         Currently only handles single message sessions.
38 |         """
39 |         self.model_name = model_name
40 |         openai.api_key = api_key
41 | 
42 |     def sample(self, text, temperature=0.8, max_tokens=100):
43 |         metadata = {"model": self.model_name,
44 |                     "temperature": temperature,
45 |                     "max_tokens": max_tokens}
46 | 
47 |         # sample from the openai model
48 | 
49 |         r = openai.ChatCompletion.create(
50 |             messages=[
51 |                 {"role": "system", "content": "You are a search engine that replies to query based on context. Answer with "
52 |                                               "information provided within the context otherwise reply with 'I don't know'"},
53 |                 {"role": "user", "content": text}
54 |             ],
55 |             **metadata
56 |         )
57 |         return r['choices'][0]["message"]["content"]
58 | 


--------------------------------------------------------------------------------
/rag_tools/repo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/repo/__init__.py


--------------------------------------------------------------------------------
/rag_tools/repo/ops.py:
--------------------------------------------------------------------------------
  1 | from git import Repo
  2 | from git.exc import InvalidGitRepositoryError
  3 | import os
  4 | import tempfile
  5 | import numpy as np
  6 | 
  7 | class DocumentationExtractor:
  8 | 
  9 |     """
 10 |     Based on code from here:https://github.com/outerbounds/mf-data-extractors
 11 |     """
 12 | 
 13 |     def __init__(self, repo_url=None, local_repo_path=None):
 14 | 
 15 |         if local_repo_path is not None:
 16 |             self.local_repo_path = local_repo_path
 17 |         else:
 18 |             self.local_repo_path = None
 19 |             self.repo_url = repo_url
 20 | 
 21 | 
 22 |     @staticmethod
 23 |     def filter_files(
 24 |         path_to_dir, 
 25 |         base_search_path, 
 26 |         exclude_paths=None, 
 27 |         exclude_files=[],
 28 |         considered_extensions=[]
 29 |     ):
 30 | 
 31 |         """
 32 |         Fetch all files in a directory, excluding those in the exclude_paths list.
 33 | 
 34 |         :param path_to_dir: Local path to the repository/directory to search.
 35 |         :param base_search_path: Subdirectory to search within the repo.
 36 |         :param exclude_paths: List of paths to exclude. 
 37 |         :param considered_extensions: List of file extensions to consider.
 38 |         """
 39 |     
 40 |         if exclude_paths is None:
 41 |             exclude_paths = []
 42 | 
 43 |         # Convert exclude paths to absolute paths
 44 |         exclude_paths = [
 45 |             os.path.join(path_to_dir, exclude_path) 
 46 |             for exclude_path in exclude_paths
 47 |         ]
 48 |         selected_files = []
 49 | 
 50 |         path_is_excluded = lambda pth: any(
 51 |             [True if x in pth else False for x in exclude_paths]
 52 |         )
 53 |         extension_is_valid = (
 54 |             lambda pth: True
 55 |             if len(considered_extensions) == 0
 56 |             else any([pth.endswith(x) for x in considered_extensions])
 57 |         )
 58 |         base_abs_path = os.path.join(path_to_dir, base_search_path)
 59 | 
 60 |         for root, _, files in os.walk(base_abs_path):
 61 | 
 62 |             # Skip excluded paths
 63 |             if path_is_excluded(os.path.abspath(root)):
 64 |                 continue
 65 | 
 66 |             for file in files:
 67 |                 fp = os.path.join(os.path.abspath(root), file)
 68 |                 if fp in exclude_paths:
 69 |                     continue
 70 |                 if np.any([
 71 |                     file.endswith(exclude_file_ext) 
 72 |                     for exclude_file_ext in exclude_files
 73 |                 ]):
 74 |                     continue
 75 |                 if extension_is_valid(file):
 76 |                     selected_files.append((fp, os.path.relpath(fp, base_abs_path)))
 77 | 
 78 |         return selected_files
 79 | 
 80 |     def extract(
 81 |         self,
 82 |         base_path,
 83 |         ref="master",
 84 |         exclude_paths=[],
 85 |         exclude_files=[],
 86 |         considered_extensions=[],
 87 |         parser=None,
 88 |     ):
 89 | 
 90 |         """
 91 |         Extract data from a repository.
 92 |         Tries to clone the repository to a temporary directory.
 93 |         Checks out the specified ref.
 94 |         Applies the parser to each file meeting criteria specified in args passed to self.filter_files.
 95 |         """
 96 | 
 97 |         assert parser is not None
 98 |         assert callable(parser)
 99 | 
100 |         if self.local_repo_path:
101 |             file_paths = self.filter_files(
102 |                 path_to_dir=self.local_repo_path,
103 |                 base_search_path=base_path,
104 |                 exclude_paths=exclude_paths,
105 |                 exclude_files=exclude_files,
106 |                 considered_extensions=considered_extensions,
107 |             )
108 |             parsed_data = [parser(fp, fn) for fp, fn in file_paths]
109 |         else:
110 |             with tempfile.TemporaryDirectory() as tmpdir:
111 |                 repo = Repo.clone_from(self.repo_url, tmpdir)
112 |                 repo.git.checkout(ref)
113 |                 file_paths = self.filter_files(
114 |                     path_to_dir=tmpdir,
115 |                     base_search_path=base_path,
116 |                     exclude_paths=exclude_paths,
117 |                     exclude_files=exclude_files,
118 |                     considered_extensions=considered_extensions,
119 |                 )
120 |                 parsed_data = [parser(fp, fn) for fp, fn in file_paths]
121 | 
122 |         return parsed_data
123 | 
124 | class RepoNotFoundException(Exception):
125 |     
126 |     def __init__(self, repo_name):
127 |         self.repo_name = repo_name
128 |         self.message = "Could not find `local_repo_path=%s`" % repo_name
129 |         super().__init__(self.message)
130 | 
131 | 
132 | def ensure_repo_exists_locally(local_repo_path=None, ref='master'):
133 | 
134 |     from git import Repo
135 |     from git.exc import NoSuchPathError
136 |     import os
137 | 
138 |     local_repo_path = os.path.expanduser(local_repo_path)
139 |     try:
140 |         repo = Repo(local_repo_path)
141 |     except NoSuchPathError:
142 |         raise RepoNotFoundException (f'\n\n{local_repo_path} is not a valid git repository path. It should look like: /path/to/repo. If you are trying to clone a remote repository it should be structured like git@github.com:<ORG or USERNAME>/<REPO NAME>.git')
143 |     except InvalidGitRepositoryError:
144 |         raise RepoNotFoundException (f'\n\n{local_repo_path} is not a valid git repository path. It should look like: /path/to/repo. If you are trying to clone a remote repository it should be structured like git@github.com:<ORG or USERNAME>/<REPO NAME>.git')
145 | 
146 |     repo.git.checkout(ref)
147 |     assert local_repo_path is not None
148 |     return local_repo_path


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # the custom library
 2 | git+https://github.com/outerbounds/rag-demo
 3 | 
 4 | # orchestration
 5 | pylint
 6 | metaflow
 7 | 
 8 | # data processing
 9 | gitpython
10 | python-frontmatter
11 | python-slugify
12 | pandas
13 | tldextract
14 | 
15 | # models
16 | langchain==0.0.262
17 | llama-index==0.8.0
18 | llama-cpp-python==0.1.77
19 | sentence-transformers
20 | openai
21 | 
22 | # vectorDB
23 | lancedb
24 | pinecone-client[grpc]
25 | 
26 | # interface
27 | matplotlib
28 | seaborn
29 | notebook
30 | ipywidgets
31 | ipykernel
32 | streamlit
33 | # jupyter_ai # incompatible with latest langchain :/
34 | 
35 | pyyaml
36 | # NOTE: conflict between pyyaml requirement of python-frontmatter?


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | setup(name='rag-demo', version='0.0.1', packages=find_packages())


--------------------------------------------------------------------------------