85 | ```
86 |
87 | Set the following environment variable too:
88 | ```
89 | export GCP_ENVIRONMENT=us-central1-gcp
90 | ```
91 |
92 | ### ▶️ Run the flows
93 | ```
94 | python flows/pinecone_index.py run
95 | python flows/lancedb_index.py run
96 | ```
--------------------------------------------------------------------------------
/chat_app.py:
--------------------------------------------------------------------------------
1 | from llama_index import Document, VectorStoreIndex, set_global_service_context, ServiceContext
2 | from llama_index.retrievers import VectorIndexRetriever
3 | from llama_index.response_synthesizers import get_response_synthesizer
4 | from llama_index.llms import OpenAI
5 | from dataclasses import dataclass
6 | from metaflow import Flow
7 | import streamlit as st
8 | from typing import List
9 | import time
10 | import os
11 |
12 | st.set_page_config(layout="centered")
13 | st.title("Metaflow Chat Bot")
14 | st.markdown("This is a bare bones LLM-powered chat bot that uses the results of Metaflow workflows to answer questions about Metaflow.")
15 |
16 | subquery_prompt = """
17 | Answer this question only if there is relevant context below: {}
18 | If there is nothing in the context say: "Could not find relevant context."
19 | Here is the retrieved context: {}
20 | """
21 |
22 | # model = st.text_input('OpenAI model', 'gpt-3.5-turbo')
23 | # temp = st.slider(label='Temperature', min_value=0.0, max_value=1.0, step=0.01, value=0.0)
24 | # chat_mode = st.text_input('LlamaIndex chat engine mode', 'react')
25 | # K = st.number_input('K results to return', min_value=1, max_value=5, value=2, step=1)
26 | model = 'gpt-3.5-turbo'
27 | temp = 0.0
28 | chat_mode = 'react'
29 | K = 2
30 |
31 | llm = OpenAI(model=model, temperature=temp, max_tokens=2048)
32 | service_context = ServiceContext.from_defaults(llm=llm)
33 | set_global_service_context(service_context)
34 |
35 | # find latest Metaflow run that saved processed df
36 | run = None
37 | for _run in Flow('DataTableProcessor'):
38 | if _run.data.save_processed_df:
39 | run = _run
40 | break
41 | st.write("Found processed df in run: {}".format(run.id))
42 |
43 | @dataclass
44 | class Context:
45 |
46 |     response: str
47 |     source_node_ids: List[str]
49 |
50 | def get_link_df(self, meta_df, link_col = 'doc_id'):
51 | return meta_df[meta_df[link_col].isin(self.source_node_ids)]
52 |
53 | def qa_iter(
54 | question: str,
55 | index: VectorStoreIndex,
56 | k:int = 2,
57 | response_mode:str = 'tree_summarize'
58 | ) -> Context:
59 | "Match a question against an index and returns the response."
60 | retriever = VectorIndexRetriever(index=index, similarity_top_k=k)
61 | response_synthesizer = get_response_synthesizer(response_mode=response_mode)
62 | query_engine = index.as_query_engine(response_synthesizer=response_synthesizer, retriever=retriever)
63 | query_res = query_engine.query(question)
64 | return Context(
65 | response=query_res.response, source_node_ids=list(query_res.metadata.keys())
66 | )
67 |
68 | def st_output_format(similar_chunk_df):
69 | md_outputs = ["#### You may find these links helpful:"]
70 | for _, chunk in similar_chunk_df.iterrows():
71 | md_outputs.append(f"##### [{chunk.header}]({chunk.page_url})")
72 | md_outputs.append(f"{chunk.contents[:100]}...")
73 | if len(md_outputs) == 1:
74 | md_outputs = []
75 | return md_outputs
76 |
77 | def generative_search_engine_iter(question, index, meta_df, k=2, meta_df_id_col='doc_id'):
78 | "Assumes index and df are defined in the global scope"
79 | context = qa_iter(question, index, k=k)
80 | similar_chunk_df = meta_df[meta_df[meta_df_id_col].isin(context.source_node_ids)]
81 | return context.response, st_output_format(similar_chunk_df)
82 |
83 | # use the processed df to build the index
84 | def get_documents_from_content_section_df(df):
85 | ids = []; documents = []
86 | for i, text in enumerate(df.contents):
87 | doc = Document(text=text, id_=i)
88 | documents.append(doc)
89 | ids.append(doc.id_)
90 | return documents, ids
91 |
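# Cache the vector index and dataframe across Streamlit reruns so they are built only once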
92 | @st.cache_resource
93 | def setup_index():
94 | df = run.data.processed_df
95 | documents, ids = get_documents_from_content_section_df(df)
96 | df['doc_id'] = ids
97 | index = VectorStoreIndex(documents)
98 | return index, df
99 |
100 | index, df = setup_index()
101 | chat_engine = index.as_chat_engine(chat_mode=chat_mode, verbose=True, streaming=True)
102 |
103 | # Initialize chat history
104 | st.markdown("# Chat history")
105 | if "messages" not in st.session_state:
106 | st.session_state.messages = []
107 |
108 | # Display chat messages from history on app rerun
109 | for message in st.session_state.messages:
110 | with st.chat_message(message["role"]):
111 | st.markdown(message["content"])
112 |
113 | # React to user input
114 | if user_prompt := st.chat_input("Hey Metaflower 🌻 what's on your mind?"):
115 |
116 | # Display user message in chat message container
117 | with st.chat_message("user"):
118 | st.markdown(user_prompt)
119 |
120 | # Add user message to chat history
121 | st.session_state.messages.append({"role": "user", "content": user_prompt})
122 |
123 | with st.spinner("Finding context..."):
124 | subquery_response, md_outputs = generative_search_engine_iter(user_prompt, index, df, K)
125 |
126 | # Display assistant response in chat message container
127 | with st.chat_message("Metaflow assistant"):
128 |
129 | message_placeholder = st.empty()
130 |
131 | streaming_response = chat_engine.stream_chat(subquery_prompt.format(user_prompt, subquery_response))
132 | full_response = ""
133 | for text in streaming_response.response_gen:
134 | full_response += text
135 | message_placeholder.markdown(full_response + "▌")
136 |
137 | for line in md_outputs:
138 | st.markdown(line)
139 |
140 | # Add assistant response to chat history
141 | st.session_state.messages.append({"role": "Metaflow assistant", "content": full_response})
142 |
143 | if st.button("Reset chat engine's memory"):
144 | chat_engine.reset()
--------------------------------------------------------------------------------
/data/test-data/mf-docs/docs/api/current.md:
--------------------------------------------------------------------------------
1 | # Current - Operating a run
2 |
3 |
4 |
5 | The `current` object is used to inspect and manipulate the currently executing run. It is only available during flow execution, i.e. inside a `FlowSpec` class and functions called from its steps. You can access the object simply by importing it: `from metaflow import current`.
6 |
7 | The attributes available in `current` depend on the decorators assigned to the flow and the step where `current` is used. Attributes that are always available are listed under *Common Attributes* below. Decorator-specific attributes are listed under the decorator name.
8 |
9 | ## Common Attributes
10 |
11 | These attributes are always available in the `current` object.
12 |
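For illustration, here is a minimal sketch (not an exhaustive list) of reading a few of these attributes inside a step:

```python
from metaflow import FlowSpec, step, current

class CurrentDemoFlow(FlowSpec):

    @step
    def start(self):
        # a few attributes that are available in every executing task
        print(current.flow_name, current.run_id, current.step_name, current.task_id)
        print(current.pathspec)   # e.g. CurrentDemoFlow/1675/start/9241
        print(current.username)   # the user who started the run
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == "__main__":
    CurrentDemoFlow()
```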
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 | ## Decorator-specific attributes
114 |
115 | These attributes are only available when the decorator is present.
116 |
117 | ### @project
118 |
119 | [The @project decorator](/production/coordinating-larger-metaflow-projects) exposes attributes related to the current deployment.
120 |
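For illustration, a minimal sketch of reading deployment information inside a `@project`-decorated flow (the project name is a placeholder):

```python
from metaflow import FlowSpec, project, step, current

@project(name="example_project")
class ProjectDemoFlow(FlowSpec):

    @step
    def start(self):
        print(current.project_name)    # "example_project"
        print(current.branch_name)     # e.g. "user.alice" or "prod"
        print(current.is_production)   # True only for --production deployments
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == "__main__":
    ProjectDemoFlow()
```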
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 | ### @card
172 |
173 | [The @card decorator](/metaflow/visualizing-results) exposes functions in `current` that allow you to customize
174 | the contents of cards using [card components](/api/cards#Card-components). For an overview of card-related APIs, see [the API reference for cards](/api/cards).
175 |
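For illustration, a minimal sketch of appending a Markdown component to a step's card:

```python
from metaflow import FlowSpec, card, step, current
from metaflow.cards import Markdown

class CardDemoFlow(FlowSpec):

    @card
    @step
    def start(self):
        # current.card.append adds a component to this step's default card
        current.card.append(Markdown("# Hello from the start step"))
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == "__main__":
    CardDemoFlow()
```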
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
--------------------------------------------------------------------------------
/data/test-data/mf-docs/docs/introduction/why-metaflow.md:
--------------------------------------------------------------------------------
1 | ---
2 | pagination_prev: index
3 | ---
4 |
5 | import Caption from "/src/components/Caption";
6 |
7 | # Why Metaflow
8 |
9 | ### 1. Modern businesses are eager to utilize data science and ML
10 |
11 | In the past, data scientists and ML engineers had to rely
12 | on a medley of point solutions and custom systems to build ML and data science
13 | applications.
14 |
15 | 
16 |
17 | ---
18 |
19 | ### 2. What is common in DS/ML applications?
20 |
21 | Applications can be built more quickly and robustly if they stand on a common, human-friendly foundation. But what should the foundation cover?
22 |
23 | 
24 |
25 | ---
26 |
27 | ### 3. All DS/ML applications use data
28 |
29 | **Data** may come in different shapes and sizes and may be loaded from various data stores. However, no matter what data is used, accessing and processing it shouldn't be too cumbersome.
30 |
31 | 
32 |
33 | ---
34 |
35 | ### 4. DS/ML applications need to perform computation
36 |
37 | Some applications require a tremendous amount of compute power - think computer vision - while others need less.
38 | Regardless of the scale, all applications need to perform **computation** reliably. Thanks to cloud computing,
39 | data scientists and ML engineers should be able to utilize elastic compute resources without friction.
40 |
41 | 
42 |
43 | ---
44 |
45 | ### 5. DS/ML applications consist of multiple interconnected parts
46 |
47 | Consider an application that loads data, transforms it, trains a bunch of models, chooses the best performing one, runs
48 | inference, and writes the results to a database. Multi-step workflows like this are the norm in ML. **A workflow
49 | orchestrator** is needed to make sure all steps get executed in order, on time.
50 |
51 | 
52 |
53 | ---
54 |
55 | ### 6. DS/ML applications evolve over time incrementally
56 |
57 | Rarely is a real-world application built and deployed only once. Instead, a typical application is built gradually,
58 | through contributions by many people. The project needs to be tracked, organized, and **versioned**, which enables systematic and
59 | continuous improvement over time.
60 |
61 | 
62 |
63 | ---
64 |
65 | ### 7. DS/ML applications produce business value in various ways
66 |
67 | To produce real business value, DS/ML applications can't live in a walled garden. They must be integrated with the surrounding systems seamlessly: Some applications enhance data in a database, some power internal dashboards or microservices, whereas some power user-facing products. There are many such ways to **deploy** ML in production. The more valuable the application, the more carefully it needs to be operated and monitored as well.
68 |
69 | 
70 |
71 | ---
72 |
73 | ### 8. DS/ML applications should leverage the best tools available
74 |
75 | For many data scientists and ML engineers, the most rewarding part of the project is **modeling**. Using their domain
76 | knowledge and expertise, the modeler should be able to choose the best tool for the job amongst off-the-shelf libraries, such as PyTorch, XGBoost, Scikit Learn, and many others. Or, if necessary, they should be able to use a wholly custom approach.
77 |
78 | 
79 |
80 | ---
81 |
82 | ### 9. Metaflow covers the full stack of DS/ML infrastructure
83 |
84 | Metaflow was originally created at Netflix, motivated by the realization that data scientists and ML engineers need
85 | help with all these concerns: Any gaps or friction in the stack can slow down the project drastically. Thanks to a
86 | common foundation
87 | provided by Metaflow, data scientists can iterate on ideas quickly and
88 | deploy them confidently by relying on a well-defined architecture and best practices, shared by everyone in the team.
89 |
90 | 
91 |
92 | ---
93 |
94 | ### 10. Metaflow takes care of the plumbing, so you can focus on the fun parts
95 |
96 | Metaflow provides a robust and user-friendly foundation for a wide spectrum of data-intensive applications, including most data science and ML use cases. Data scientists and ML engineers who know the basics of Python can build their own applications, models, and policies on top of it, while Metaflow takes care of the low-level infrastructure: data, compute, orchestration, and versioning.
97 |
98 | 
99 |
100 | ---
101 |
102 | ### 11. Metaflow relies on systems that engineers know and trust
103 |
104 | Metaflow was designed at Netflix to serve the needs of business-critical ML/DS applications. It relies on proven and scalable infrastructure which works for small and large organizations alike. Metaflow integrates with all the
105 | top clouds as well as with Kubernetes and systems around them in a responsible manner. It respects the security and other
106 | policies of your company, making engineering teams happy too.
107 |
108 | 
109 |
110 | ---
111 |
112 | ### 12. Metaflow is used by hundreds of innovative companies
113 |
114 | Today, Metaflow powers thousands of ML/DS applications at innovative companies such as
115 | [Netflix](https://netflixtechblog.com/supporting-content-decision-makers-with-machine-learning-995b7b76006f),
116 | [CNN](https://medium.com/cnn-digital/accelerating-ml-within-cnn-983f6b7bd2eb),
117 | [SAP](https://blogs.sap.com/2022/04/20/train-your-model-in-sap-ai-core-using-the-metaflow-argo-plugin/),
118 | [23andMe](https://medium.com/23andme-engineering/machine-learning-eeee69d40736),
119 | [Realtor.com](https://medium.com/realtor-com-innovation-blog/improving-data-science-processes-to-speed-innovation-at-realtor-com-b6b90fa530dc),
120 | [REA](https://www.rea-group.com/about-us/news-and-insights/blog/accelerating-experimentation-with-mlops/),
121 | [Coveo](https://outerbounds.com/blog/dataops-mlops-reasonable-organizations/),
122 | [Latana](https://aws.amazon.com/blogs/startups/brand-tracking-with-bayesian-statistics-and-aws-batch/),
123 | and
124 | hundreds of others across industries. Commercial support for Metaflow is provided by [Outerbounds](https://outerbounds.com).
125 | To hear first-hand experiences from these companies and many others, [join the Metaflow Slack](http://slack.outerbounds.co).
126 |
127 |
--------------------------------------------------------------------------------
/data/test-data/ob/blog/llm-tuning-metaflow.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Fine-tuning a Large Language Model using Metaflow, featuring LLaMA and LoRA"
3 | date: "2023-06-21T12:41:26"
4 | tags: ["AI", "Foundation Models", "LLM", "NLP"]
5 | keywords: ["AI", "Foundation Models", "LLM", "NLP"]
6 | categories: ["Foundation Models"]
7 | authors: [federico, obteam]
8 | description: "A workflow template built with Metaflow for fine-tuning LLMs for custom use cases."
9 | image: "/blog/cover/llm-tuning-metaflow.png"
10 | ogImage: "/blog/og/llm-tuning-metaflow.png"
11 | slug: "llm-tuning-metaflow"
12 | ---
13 |
14 | ---
15 |
16 | *This post demonstrates how you can use Metaflow to fine-tune a production-grade custom large language model (LLM) based on instruction tuning. We provide an open-source, fine-tuning workflow template that allows you to run models on local or cloud-based GPUs and connect the results to surrounding production systems using the complete toolchain of Metaflow.*
17 |
18 | [Federico](https://federicobianchi.io) is a post-doctoral researcher at Stanford University, working on NLP and Large Language Models. He frequently releases research projects as open-source tools that have collectively gathered thousands of [GitHub stars](https://github.com/vinid).
19 |
20 | ---
21 |
22 | Motivation
23 | ----------
24 | Unsurprisingly, as of June 2023, building production systems that leverage custom large language models takes a lot of work and money. As usual in any novel technical field, one faces a diverse, quickly moving landscape of immature tooling, missing documentation, and vibrant but chaotic communities. Developers face complex and varying dependencies of LLMs, spanning hardware drivers and bleeding-edge software frameworks, which can take hours and days to figure out for any given use case.
25 |
26 | Some astute companies may decide to wait a few years for the field to mature. However, many companies need or want to start innovating faster, and mature tools like [Metaflow](https://metaflow.org/) can help. Metaflow helps you design robust workflows to support any data-intensive application, allowing you to amortize the fixed cost of setting up systems - such as LLM experimentation pipelines - over the project's life. Using Metaflow, teams can set up appropriate human and technical workflows for LLM development and deployment, allowing them to focus on the business's specific needs.
27 |
28 | Metaflow has been supporting serious ML use cases like this for years. The foundational needs of LLM development don’t differ much from those of earlier models (besides scale!), so Metaflow is a natural fit for building systems around custom LLMs.
29 |
30 | Foundational infrastructure for LLMs
31 | ------------------------------------
32 | If you just want to run the code and do not care about owning the supply chain or deploying the results to a production system, [Colab](https://colab.google/) and similar GPU-powered-notebook-as-a-service providers are a convenient way to access the infrastructure needed for prototyping quickly. Alternatively, projects like [GPT4All](https://gpt4all.io/index.html) make running LLM inference on a laptop without a GPU straightforward.
33 |
34 | However, as we discussed in [part one](/blog/custom-llm-tuning) of this series, there are cases where organizations that want to move beyond prototyping in notebooks and consider using LLMs in production need more substantial ML tooling and infrastructure: You need to consider how to connect models to surrounding systems, run training and inference and all supporting steps on a scalable compute platform, iterate and track models, code and data across versions, and have both technical and human workflows in place for continuous improvement and experiments. All this in business-critical systems with tight SLA, security, and compliance requirements.
35 |
36 | How Metaflow helps
37 | ------------------
38 | This article shows how to use Metaflow to instruction-tune a custom LLM, accompanied by [the full stack of production-grade ML tooling](https://outerbounds.com/docs/infra-stack/) that Metaflow provides. We use a few lines of Metaflow code to access and monitor GPUs in CoreWeave’s cloud, making it easy to produce, track, and report results in shareable [Metaflow cards](https://docs.metaflow.org/api/cards) when running off-the-shelf open-source LLM code.
39 |
40 | If you are unfamiliar with the features of Metaflow, we encourage you to look at the [Metaflow documentation](https://docs.metaflow.org). Crucially, all the functionality Metaflow provides can be helpful when developing and deploying real-world systems powered by LLMs.
41 |
42 | For instance, Metaflow enables us to explore different model sizes and variants while quickly tuning infrastructure requirements and scheduling runs by changing single values in Python code. Switching between models helps us explore rapidly without unnecessarily wasting money on under-utilized GPUs.
43 |
44 | 
45 | Top left: The lineage of the Alpaca-LoRA model used as a default in this post; Bottom: Steps in the Metaflow workflow; Top right: The output of the fine-tuning step is model checkpoints in HuggingFace
46 |
47 | As Metaflow runs any Python code, you can easily leverage state-of-the-art models and APIs from your favorite modeling framework. For instance, in the example workflow we push checkpoints to the HuggingFace hub with a few lines of code. We followed the [HuggingFace documentation](https://huggingface.co/docs/transformers/main_classes/trainer#checkpoints) to save all the checkpoints to a local folder, using the checkpoint-xxx/ naming pattern, where xxx is the number of training steps.
48 |
49 | Using HuggingFace APIs with Metaflow makes it easy to create a model repo for each checkpoint, train on whatever hardware you need, and push the results back to the hub to use in an application and for downstream testing. Doing this and sharing incremental model checkpoints across teams can also hugely reduce costs: people need not recompute the same training results and can instead pick up training from where a colleague left off.
50 |
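For illustration, a minimal sketch of pushing one such checkpoint folder to the hub with the `huggingface_hub` client (the repo id and folder path are placeholders, not the template's actual code):

```python
import os
from huggingface_hub import HfApi

api = HfApi()
token = os.environ["HUGGINGFACE_TOKEN"]
repo_id = "my-org/alpaca-lora-7b"                 # placeholder repo id
api.create_repo(repo_id, token=token, exist_ok=True)
api.upload_folder(
    folder_path="lora-alpaca/checkpoint-200",     # a local checkpoint-xxx/ folder
    repo_id=repo_id,
    token=token,
)
```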
51 | Instruction tuning with Metaflow
52 | --------------------------------
53 | For example, let’s fine-tune a recent LLaMA variant, taking an already trained LLaMA language model and training it further on an instruction tuning dataset. The learning task is still language modeling, but the training dataset contains specific instruction-following examples.
54 |
55 | In this case, the model learns to do language modeling relevant to the instructions in the Alpaca dataset. You can imagine using any dataset, such as how the Databricks team created a custom instruction-tuning [dataset](https://huggingface.co/datasets/databricks/databricks-dolly-15k) in [Dolly v2](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm). You can find additional information about Metaflow and large language models in our [blog post on tuning Dolly](https://outerbounds.com/blog/train-dolly-metaflow/).
56 |
57 | ### Show me the code
58 | The core concepts of this workflow template are the following:
59 | 1. HuggingFace to access the upstream models and to checkpoint our fine-tuned models,
60 | 2. CoreWeave to access GPU resources, and
61 | 3. Metaflow to structure the Python code, run it on the GPUs (carefully monitoring resource utilization with a Metaflow card), and track versions of code and models.
62 |
63 | 
64 |
65 | The following code shows how to call the main fine-tuning logic from the [Alpaca LoRA codebase](https://github.com/tloen/alpaca-lora).
66 |
67 |
68 | Use PyTorch, HuggingFace and Metaflow to build a workflow for fine-tuning LLMs.
69 |
70 |
71 | ```python
72 | # Contents of the flow.py file in the repository linked above.
73 |
74 | from metaflow import FlowSpec, step, Parameter, resources, environment
75 | from mixins import HuggingFaceLora, N_GPU, visible_devices
76 | from custom_decorators import pip, gpu_profile
77 | import os
78 |
79 | class LlamaInstructionTuning(FlowSpec, HuggingFaceLora):
80 |
81 | push_checkpoints = Parameter("-push", help="push checkpoints on huggingface", default=False, type=bool)
82 |
83 | @step
84 | def start(self):
85 | self.next(self.finetune)
86 |
87 | @environment(vars={
88 | "CUDA_VISIBLE_DEVICES": visible_devices,
89 | "WORLD_SIZE": N_GPU,
90 | "HUGGINGFACE_TOKEN": os.environ["HUGGINGFACE_TOKEN"],
91 | "HF_ORGANIZATION": os.environ["HF_ORGANIZATION"]
92 | })
93 | @gpu_profile(interval=1)
94 | @pip(file="requirements.txt")
95 | @resources(gpu=N_GPU, cpu=16, memory=128000) # tested with A100 and A6000 GPU.
96 | @step
97 | def finetune(self):
98 | self.run()
99 | if self.push_checkpoints:
100 | self.upload_to_huggingface()
101 | self.next(self.end)
102 |
103 | @step
104 | def end(self):
105 | pass
106 |
107 | if __name__ == '__main__':
108 | LlamaInstructionTuning()
109 | ```
110 |
111 | [Metaflow](https://metaflow.org/) provides [a scaffolding for data science workflows](https://docs.metaflow.org/metaflow/introduction), all written in Python. In this case, the flow centers around the `finetune` step, where we use the [multiple inheritance pattern](https://realpython.com/lessons/multiple-inheritance-python/) to modularize the workflow, separating the Alpaca LoRA code that makes HuggingFace API calls from the Metaflow code that organizes the workflow.
112 |
113 | Specifically, in the `HuggingFaceLora` class, we define the logic to run the open-source LLM fine-tuning code in a [distributed data-parallel](https://pytorch.org/docs/stable/notes/ddp.html?highlight=distributed+data+parallel) manner using the [torchrun](https://pytorch.org/docs/stable/elastic/run.html) API. The LLM code is run inside a Metaflow task, and a data scientist can add single lines of Python code to layer complex functionality on each task, [like requesting precise GPU and memory resources on the cloud instance that runs the job](https://docs.metaflow.org/scaling/remote-tasks/introduction) and monitoring the resources to ensure they are appropriately sized.
114 |
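As a rough, hypothetical sketch of that idea (not the actual `HuggingFaceLora` code; the script name, flags, and GPU count are placeholders), the mixin's `run()` can simply shell out to `torchrun`:

```python
import subprocess

N_GPU = 4  # in the real template this comes from mixins.py

class HuggingFaceLoraSketch:
    def run(self):
        # launch the fine-tuning script once per GPU in a data-parallel fashion
        subprocess.run(
            ["torchrun", f"--nproc_per_node={N_GPU}", "finetune.py",
             "--base_model", "decapoda-research/llama-7b-hf"],
            check=True,
        )
```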
115 | The key idea for data science teams is that once they have access to a Metaflow deployment, a data scientist who can feasibly write Python code can [write, run, and deploy production-grade workflows in a single work session](https://docs.metaflow.org/introduction/what-is-metaflow). Metaflow [persists and tracks all artifacts automatically](https://docs.metaflow.org/metaflow/basics#artifacts), [exposes them to notebooks](https://docs.metaflow.org/metaflow/client) and other programmatic use cases, and shows them on a UI. Once everything works locally, the workflow can be [deployed to production, connecting to other workflows both upstream and downstream](https://docs.metaflow.org/production/introduction), with a single click.
116 |
117 | If you are curious to test all these features in action, you can [do it easily in the Metaflow sandbox](https://outerbounds.com/sandbox).
118 |
119 | Models
120 | ------
121 | We fine-tuned four of the recent LLaMA models on the same dataset with a fixed computing budget for each model; we used [Low-Rank Adaptation](https://arxiv.org/abs/2106.09685), making use of the recent [Alpaca LoRA repository](https://github.com/tloen/alpaca-lora). The models we fine-tuned are the 7B, 13B, 33B, and 65B parameters models, with the idea that larger models should provide better performance and answers. We saved multiple checkpoints for each model and chose the best one by validation loss after training.
122 |
123 | We chose the Alpaca dataset - built by generating instructions from GPT-3. We use a [cleaned version](https://huggingface.co/datasets/yahma/alpaca-cleaned) of the dataset in which some errors are corrected. The dataset contains 50K instructions. You can find more details about how we used Metaflow in the practical considerations section.
124 |
125 | ### Quantitative evaluation
126 | We wanted to assess whether our fine-tuned models still perform as expected after fine-tuning. While there is still active discussion about how to evaluate instruction and chat models (see the [ChatBot Arena](https://lmsys.org/blog/2023-05-03-arena/), for example), we decided to evaluate our models against standard LLM benchmarks, as a more customized evaluation approach is beyond the scope of this post. Specifically, we use the [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness) package developed by EleutherAI to evaluate our fine-tuned models on these tasks in a zero-shot manner. We chose three question-answering benchmarks: [PIQA](https://huggingface.co/datasets/piqa), [BoolQ](https://huggingface.co/datasets/boolq), and [OpenBookQA](https://huggingface.co/datasets/openbookqa).
127 |
128 | Here is an example from PIQA (the other datasets are similar):
129 |
130 | > “To separate egg whites from the yolk using a water bottle, you should…”
131 |
132 | The dataset has two possible answers:
133 |
134 | > “Squeeze the water bottle and press it against the yolk. Release, which creates suction and lifts the yolk.”
135 |
136 | > “Place the water bottle and press it against the yolk. Keep pushing, which creates suction and lifts the yolk.”
137 |
138 | Given that this is a multiple choice question task, [LM-Harness](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md#multiple-choice-format) runs the evaluation by comparing the likelihood of the label words.
139 |
140 | So, how well do these models work? Let’s look at some results (the evaluation metric here is accuracy).
141 |
142 |
143 |
144 |
145 | | Model | PIQA | BOOLQ | OPENBOOKQA |
146 | | ----- | ---- | ----- | ---------- |
147 | | LLAMA 7B | 0.7916 | 0.7511 | 0.4440 |
148 | | ALPACA 7B | 0.8030 | 0.7835 | 0.4420 |
149 | | LLAMA 13B | 0.8003 | 0.7798 | 0.4480 |
150 | | ALPACA 13B | 0.8030 | 0.8150 | 0.4620 |
151 |
152 |
153 |
154 |
155 | These results seem consistent with what we expected based on various leaderboards (see [LLaMA results](https://paperswithcode.com/paper/llama-open-and-efficient-foundation-language-1), for example) and papers. As expected, there is no notably large increase in performance. The 13B-parameter models do slightly better than their 7B counterparts. We do not expect a substantial increase in performance with respect to the original LLaMA models; instead, we would like to maintain similar performance on these benchmarks. Instruction tuning should give us models that follow instructions better without performing worse on general benchmarks. Our evaluation shows that the models perform well on the latter, but do they follow instructions?
156 |
157 | ### Qualitative evaluation
158 | When working with LLMs, it is essential to be mindful that the research field is new and changing rapidly. Therefore, when considering this technology for a production environment, we need to evaluate models qualitatively as well as against quantitative benchmarks, and both may need significant iteration and revision to assess performance in a way that makes sense in any given production context.
159 |
160 | Even with the small training dataset we used and only three epochs of training, our models already show exciting results. The bigger the model is, the better (and more valuable) its generations become.
161 |
162 |
163 |
164 | | Instruction | 7B | 30B |
165 | | ----------- | -- | --- |
166 | | **Write a poem about capybaras** | Capybaras, the largest rodent in the world,<br>With their shaggy coats and bushy tails,<br>They roam the grasslands of South America,<br>In search of food and water to quench their thirst.<br>Capybaras are herbivores, and they love to graze,<br>Eating grasses, fruits, and vegetables, too,<br>They are social creatures, living in herds,<br>And they're known for their playful and friendly deeds. | A Capybara's Life<br>A capybara's life is filled with joy,<br>As they frolic in the grass and play in the sun.<br>They're the largest rodents in the world,<br>With bodies built for swimming and for fun.<br>They're social creatures, always in a group,<br>Enjoying each other's company, without a flop.<br>They love to eat grass and aquatic plants,<br>A diet that keeps them healthy and well-fed. |
167 | | **Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering.** (From Flan) | No, it is not possible for Geoffrey Hinton to have a conversation with George Washington. This is because George Washington is a historical figure who lived from 1732 to 1799, while Geoffrey Hinton is a living person who was born in 1957. Time travel is not possible, so it is not possible for two people from different eras to have a conversation with each other. | No, it is not possible for Geoffrey Hinton to have a conversation with George Washington. Geoffrey Hinton is a living person who was born in 1947, while George Washington died in 1799. It is not possible for a living person to have a conversation with someone who has been dead for more than 200 years. |
168 | | **Classify this document in one of these categories, "sport", "politics", or "technology".**<br>**“The LLaMA models are a family of Natural Language Processing LLMs that can be applied to many different tasks.”** | technology | technology |
169 | | **Do aliens invest in stocks?** | No, aliens do not invest in stocks. Stock investing is a human activity that involves buying and selling shares of publicly traded companies in order to profit from changes in their share prices. Aliens do not have access to the same financial markets and systems that humans do, so they would not be able to participate in stock investing.<br>However, it is possible that aliens could invest in other types of financial instruments, such as bonds, mutual funds, or real estate, if they had access to the appropriate financial markets and systems. | I'm sorry, but as an AI language model, I don't have access to any information about aliens or their investment habits. It's possible that some alien civilizations may invest in stocks, but it's also possible that they may have different investment strategies or may not invest in stocks at all. It's difficult for me to give a definitive answer to this question without more information or data. It's possible that some alien civilizations may invest in stocks, but it's also possible that they may have different investment strategies. |
170 | | **To separate egg whites from the yolk using a water bottle, you should…** (from PIQA) | To separate egg whites from the yolk using a water bottle, you should follow these steps:<br>1. Crack the egg into a small bowl and separate the yolk from the egg white.<br>2. Fill the water bottle with cold water.<br>3. Place the egg white into the water bottle.<br>4. Shake the water bottle vigorously to separate the egg white from the yolk.<br>5. Remove the egg white from the water bottle and discard the yolk.<br>6. Repeat the process for the remaining eggs. | To separate egg whites from the yolk using a water bottle, you should follow these steps:<br>1. Start by cracking an egg into a small bowl or cup.<br>2. Hold the water bottle upside down over the bowl or cup.<br>3. Slowly squeeze the water bottle, allowing the water to flow into the bowl or cup.<br>4. As the water flows into the bowl or cup, it will push the egg yolk towards the bottom of the bowl or cup. |
171 | | **Best way to pierce ears** (from PIQA)| The best way to pierce ears is to use a sterilized, hygienic piercing gun or needle. It is important to follow the instructions provided by the piercing professional and to clean the area thoroughly before and after the piercing. It is also important to use a high-quality, non-allergenic jewelry and to take proper care of the piercing by cleaning it regularly and avoiding contact with water while the piercing is healing. It is recommended to wait at least 6-8 weeks for the piercing to heal completely before changing the jewelry. | The safest and most effective way to pierce your ears is to have it done by a professional piercer at a reputable piercing studio. A professional piercer will use sterilized equipment and follow strict hygienic procedures to minimize the risk of infection or other complications. They will also be able to advise you on the best placement and type of jewelry for your piercing, as well as provide aftercare instructions to ensure proper healing. |
172 | | **What's the best way to drink water if you are juggling three balls?** | If you are juggling three balls, the best way to drink water is to use a straw. This will allow you to keep your hands free to continue juggling, while also ensuring that you don't spill any water on yourself or your juggling equipment. Additionally, it's important to drink small sips of water, rather than gulping down large amounts at once. This will help you to stay hydrated without interfering with your juggling routine. | If you are juggling three balls, the best way to drink water would be to take a break from juggling, set the balls down on a safe surface, and take a sip of water from a cup or bottle. Juggling and drinking at the same time can be difficult and may result in spilling water or dropping the balls. It's important to stay hydrated while juggling, but it's best to do so in a way that doesn't interfere with your juggling performance. |
173 |
174 |
175 |
176 | These results show multiple things: first off, the models are good at replying to questions and instructions. They all produce coherent text that appears correct. The 7B model is more prone to generating incorrect content, such as getting someone's year of birth wrong or suggesting something that might be unsafe. We have color-coded some replies to highlight possibly good (green), wrong (red), or unclear (orange) answers.
177 |
178 | Summary
179 | -------
180 | You can use the above Metaflow workflow as a generic template for fine-tuning models available on HuggingFace. Besides the Alpaca instruction dataset we used to fine-tune various sizes of LLMs, you can apply the same approach to fine-tune models with your instructions to optimize models for your use cases. We are happy to help you get started with this template and Metaflow in general. Join us and thousands of other ML engineers, AI enthusiasts, and data scientists on [Metaflow Community Slack](http://slack.outerbounds.co/)!
181 |
182 | Acknowledgments
183 | ---------------
184 | We would like to thank Mert Yuksekgonul, Mirac Suzgun, Patrick John Chia, Silvia Terragni, Giuseppe Attanasio, and Jacopo Tagliabue for feedback and suggestions on a previous version of this blog post.
185 |
186 |
--------------------------------------------------------------------------------
/data/test-data/ob/blog/metaflow-fast-data.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Fast Data: Loading Tables From S3 At Lightning Speed"
3 | date: "2023-05-05T20:13:16"
4 | tags: ["data", "MLOps"]
5 | keywords: ["data", "MLOps"]
6 | categories: ["Data", "Best Practices"]
7 | authors: [obteam]
8 | description: "Load and process tabular data at lightning speed using Apache Arrow and Metaflow."
9 | slug: "metaflow-fast-data"
10 | image: "/blog/cover/metaflow-fast-data.png"
11 | ogImage: "/blog/og/metaflow-fast-data.png"
12 | ---
13 |
14 | ---
15 |
16 | *This article answers a common question: How can I load tabular data quickly from S3 into a Metaflow workflow and process it efficiently? Thanks to Metaflow’s straightforward support for vertical scalability, its high-throughput S3 client, and high-performance tooling powered by Apache Arrow, you can handle surprisingly large amounts of data, surprisingly fast.*
17 |
18 | *You can execute all benchmarks and examples with [the code in this repository](https://github.com/outerbounds/fast-data-blog/).*
19 |
20 | ---
21 |
22 | Ever since the early days of Metaflow at Netflix, we have been asking data scientists and other users of Metaflow a fundamental question: what is the greatest source of friction in your day-to-day work? Maybe unsurprisingly, the majority of answers relate to data: finding the right data, accessing data, transforming data, and utilizing data in various ways.
23 |
24 | In contrast to traditional software, which has a limited surface area with the outside world, data exposes ML/AI-powered applications to constantly changing real-world entropy and complexity. It would be unreasonable to expect that we can address the diverse use cases with a single nifty solution. Needs differ greatly, say, between computer vision, NLP, large tabular data, and small datasets.
25 |
26 | We can support a wide variety of use cases by working well with a broad set of open-source data tools and efficient infrastructure. This approach is illustrated by our view of [the full stack of ML infrastructure](https://outerbounds.com/docs/infra-stack/), which acknowledges the need to provide plenty of support for data:
27 |
28 | 
29 |
30 | Metaflow doesn’t provide magical new abstractions or paradigms for processing data, but it provides foundational, human-friendly infrastructure that works smoothly with the tools of your choice. Data scientists certainly care about
31 | data, but data tooling should just work without hassle. Often, simpler is better.
32 |
33 | This article focuses on a common use case of data: tabular, structured datasets. This use case is particularly interesting, as tooling for it has evolved rapidly over the past few years.
34 |
35 |
36 | A fast path for tabular data
37 | ----------------------------
38 |
39 | All modern data warehousing solutions either store data natively as Parquet-encoded files (like various versions of [the cloud-based data lake](https://towardsdatascience.com/an-aws-data-lake-with-s3-explained-c67c5f161db3) e.g. powered by [AWS Athena](https://aws.amazon.com/athena/) or [Spark](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark.html)), or they allow exporting/unloading data as Parquet files (e.g. [Snowflake](https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet) or [Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html)).
40 |
41 | Conveniently, these systems allow you to create materialized views which can be processed outside the warehouse. You can use SQL and [the Create-Table-As-Select (CTAS) pattern](https://docs.aws.amazon.com/athena/latest/ug/ctas.html) to query even petabytes of data, extracting the results for downstream processing in Metaflow.
42 |
43 | Once you have Parquet files in S3, you can hit the fast path:
44 |
45 | 
46 |
47 | 1. You can load data from S3 directly to memory *very quickly*, at tens of gigabits per second or more, using [Metaflow’s optimized S3 client, `metaflow.S3`](https://docs.metaflow.org/scaling/data).
48 |
49 | 2. Once in memory, Parquet data can be decoded efficiently using [Apache Arrow](https://arrow.apache.org/).
50 |
51 | 3. The in-memory tables produced by Arrow are interoperable with various modern data tools, so you can use the data in various ways without making additional copies, which speeds up processing and avoids unnecessary memory overhead.
52 |
53 | This pattern is becoming increasingly popular, thanks to the maturation of Apache Arrow itself and versatile tools around it. The excitement is further motivated by the realization that a single medium-size EC2 instance can handle an amount of data that used to require a Hadoop or Spark cluster just a decade ago - this time with much fewer operational headaches, and at a much lower cost.
54 |
55 | The pattern is still new, so not everyone is fully aware of how well and fast it works - under suitable conditions. To showcase this, we first focus on the left side of the diagram, addressing a common myth that loading data from S3 is slow. After this, we focus on the right side, demonstrating how to use the data with various high-performance tools efficiently.
56 |
57 | S3 is fast (when used correctly)
58 | --------------------------------
59 |
60 | Occasionally, we are asked how to cache data from S3 on local disk, an instance volume, or a distributed file system like EFS to make loading and processing data faster. Counterintuitively, loading data from S3 can be much faster than loading it from local disk, so there's no need to add extra layers of complexity.
61 |
62 | The question likely stems from first-hand experiences of S3 being slow, which can easily happen if you don’t have a setup that leverages its strengths. This benchmark illustrates the point:
63 |
64 | 
65 |
66 | The bars refer to a Metaflow task that uses `metaflow.S3` to download the dataset, running on EC2 with AWS Batch [using the @batch decorator](https://docs.metaflow.org/scaling/remote-tasks/aws-batch), comparing four different configurations:
67 |
68 | - **Small (disk)**: 8GB RAM, 2 CPU cores, a c4.2xlarge instance with EBS.
69 | - **Medium (disk)**: 8GB RAM, 8 CPU cores, a c4.8xlarge instance with EBS.
70 | - **Medium (tmpfs)**: 30GB RAM, 8 CPU cores, a c4.8xlarge utilizing Metaflow’s new `tmpfs` feature.
71 | - **Large (tmpfs)**: 200GB RAM, 32 CPU cores, an m5n.24xlarge utilizing Metaflow’s new `tmpfs` feature.
72 |
73 | There is a 35-fold speed difference between the slowest and the fastest configuration! Understandably, if your tasks ran under the first configuration, you would conclude that S3 is slow.
74 |
75 | ### Maximizing the S3 performance
76 |
77 | A key observation is that the download throughput of S3 is a combination of multiple factors. In order to achieve the maximum throughput, pay attention to the following dimensions:
78 |
79 | - **Same region**: Make sure the EC2 instances hosting the tasks are located in the same region as the S3 bucket you are loading data from.
80 | - **File layout**: You need to download multiple files in parallel using e.g. [metaflow.S3.get_many](https://docs.metaflow.org/api/S3#S3.get_many). The files should be around 0.1-1GB each. Fortunately, it is easy to produce partitioned outputs like this with many query engines.
81 | - **Instance size**: Larger EC2 instances boast more CPU cores, network throughput, and memory. When using e.g. `@batch` with Metaflow, instances autoscale based on demand, so faster processing times can lead to lower costs, despite the higher unit costs of larger instances.
82 | - **Data fits in RAM**: Crucially, loading data from S3 directly to memory is faster than loading data from S3 to an instance volume. If data doesn’t fit in memory, performance can be very bad due to slow local disk IO.
83 |
84 | The importance of the last point is demonstrated by the two medium bars: By increasing the amount of memory available for the tasks - simply by setting `@resources(memory=32000)` - we gain an eightfold increase in S3 throughput!
85 |
86 | The highest bar demonstrates the massive horsepower available on the largest instances: By using `metaflow.S3`, we can reach a beast-mode throughput of 32 Gbit/s, which can easily beat local SSDs on laptops. In other words, loading data from S3 to an EC2 instance can be faster than loading data locally on a laptop, or loading it from an EBS volume, an ephemeral instance disk, or EFS.
87 |
88 | New feature: tmpfs support for Batch and Kubernetes
89 | ---------------------------------------------------
90 |
91 | We have been using the above fast data pattern successfully for years to power Metaflow workflows handling terabytes of data. However, thus far there has been a small gotcha.
92 |
93 | While allocating enough memory through `@resources` should guarantee that data can be downloaded quickly, we still rely on the local filesystem to nominally back the files, even though the files never hit the disk thanks to caching. Sadly, it is not possible to allocate disk space for tasks on the fly while using AWS Batch; doing so requires a somewhat tedious change to [the instance’s launch template](https://docs.aws.amazon.com/batch/latest/userguide/launch-templates.html).
94 |
95 | To address this issue, recently we implemented support for memory-based `tmpfs` filesystem on Batch and Kubernetes in Metaflow. You can use the feature to create an ephemeral filesystem backed by memory on the fly, without having to change anything on the infrastructure side.
96 |
97 | We made sure that the `metaflow.S3` client is aware of the `tmpfs` volume, so it will automatically use it to speed up downloads when you enable it. To benefit from the new feature, simply add
98 |
99 | ```python
100 | @batch(use_tmpfs=True)
101 | ```
102 |
103 | for your AWS Batch workloads or
104 |
105 | ```python
106 | @kubernetes(use_tmpfs=True)
107 | ```
108 | for Kubernetes. There are a few additional features related to `tmpfs` which you can [read more about in the Metaflow documentation](https://docs.metaflow.org/scaling/data#using-metaflows3-for-in-memory-processing).
109 |
110 |
111 | From cloud to table
112 | -------------------
113 |
114 | As shown above, we can load bytes to memory very quickly. To actually use the data, we must also decode the Parquet files to [an in-memory `pyarrow.Table` object](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html). When taking the decoding time into account, the differences are even more striking:
115 |
116 | 
117 |
118 | The configurations are the same as in the first chart above. We added a top-of-the-line M2 Macbook Pro (the purple bar) as a comparison point. The Macbook loads data from a local SSD, not from S3 as the instances do.
119 |
120 | As shown above, a powerful Macbook beats small and medium-size instances when the instances are using a local disk, but even a medium-size instance beats the Macbook when using in-memory processing. In this case, a large instance can load a table about 8x faster than a Macbook.
121 |
122 | This result is a combination of multiple factors: The larger instances have more CPU cores, more memory, and more IO bandwidth available than a Macbook. Naturally, you must utilize the resources efficiently to gain the maximum performance: Use [metaflow.S3.get_many](https://docs.metaflow.org/api/S3#S3.get_many) to maximize network throughput and multithreaded decoding to benefit from all the available CPU cores, as exemplified by [the `table_loader.py` utility module](https://github.com/outerbounds/fast-data-blog/blob/main/table_loader.py).
123 |
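To make the pattern concrete, here is a minimal sketch, not the repository's `table_loader.py`, that combines `metaflow.S3.get_many` with Arrow's multithreaded Parquet reader (the S3 prefix and shard names are placeholders):

```python
from metaflow import S3
import pyarrow as pa
import pyarrow.parquet as pq

def load_table(s3_prefix, keys):
    # keys: names of Parquet shards under the prefix, ideally ~0.1-1GB each
    with S3(s3root=s3_prefix) as s3:
        objs = s3.get_many(keys)                            # parallel download (tmpfs-backed if enabled)
        shards = [pq.read_table(obj.path) for obj in objs]  # multithreaded Parquet decoding
    return pa.concat_tables(shards)

# table = load_table("s3://my-bucket/ctas-output/", ["part-000.parquet", "part-001.parquet"])
```
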
124 |
125 | From table to results
126 | ---------------------
127 |
128 | With [a `pyarrow.Table` object](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html) in memory, the possibilities are many! We highlight three common use cases below.
129 |
130 | Crucially, all the libraries listed below support zero-copy instantiation from a `pyarrow.Table`, so you don’t have to reserve a large amount of extra `@resources(memory=)` just to handle extra copies or inefficient internal representations. Ballooning memory overhead used to be a problem in the past with older Pandas, prior to Arrow.
131 |
132 |
133 | You can code along with the following sections using [this repository](https://github.com/outerbounds/fast-data-blog/).
134 |
135 | ### Dataframes
136 |
137 | Moving to Pandas is trivial and fast:
138 |
139 | ```python
140 | df = table.to_pandas()
141 | ```
142 |
143 | You can add a `zero_copy_only=True` flag to make sure that you will get warned if the conversion requires extra copies of data. Since [the Pandas 2.0 release](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i) that came out just a month ago, Pandas is fully compatible with Arrow data types, so working with Arrow and Pandas should be smooth sailing.
144 |
145 | While Pandas provides a familiar API, its operations are not always the fastest. You can often enjoy a higher performance by using [Polars - a high-performance dataframe library](https://pola.rs):
146 |
147 | ```python
148 | import polars as pl
149 | df = pl.from_arrow(table)
150 | ```
151 |
152 | ### SQL
153 |
154 | Sometimes it is more convenient to use SQL for data processing than dataframe APIs. For instance, you can follow this
155 | pattern to enable clear division of responsibilities between data engineers and data scientists:
156 |
157 | 
158 |
159 | Diagram adapted from the book, Effective Data Science Infrastructure
160 |
161 |
162 | In this scenario, data engineers are responsible for maintaining company-wide, authoritative fact tables that individual projects can rely on. Projects may create their own interpretations of data through their own lightweight ETL process e.g. for feature transformations.
163 |
164 | The project-specific ETL can run within their Metaflow workflows using [a small but surprisingly powerful in-process database, DuckDB](https://duckdb.org/). A benefit of this approach is that data scientists can own and iterate on their specific ETL very quickly, even testing it locally on their workstations.
165 |
166 | Using this pattern is not harder than using a dataframe:
167 |
168 | ```python
169 | import duckdb
170 | query = "SELECT * FROM arrow_table"
171 | relation = duckdb.arrow(table)  # register the pyarrow.Table from the previous sections
172 | df = relation.query('arrow_table', query).to_df()
173 | ```
174 |
175 | To get familiar with using DuckDB and Metaflow together in a machine learning context, you can follow along with this [recommender systems tutorial](https://outerbounds.com/docs/recsys-tutorial-L2/).
176 |
177 | ### Interfacing with ML libraries
178 |
179 | Many modern ML libraries are well optimized when it comes to utilizing CPU and GPU resources. However, to benefit
180 | from high-performance training and inference, you need to be able to feed data to the model quickly.
181 |
182 | Luckily, Arrow data is [readily convertible to Pandas and NumPy](https://arrow.apache.org/docs/python/numpy.html) which are supported by all major ML libraries. [Through NumPy, you can also convert data to various tensor objects](https://outerbounds.com/docs/python-tabular-data-structures/).
183 |
184 | To illustrate the pattern, [we included a simple ML example](https://github.com/outerbounds/fast-data-blog/blob/main/fast_data_modeling.py) that shows how to convert an Arrow table to a Pandas dataframe, and feed this object to [a LightGBM model](https://lightgbm.readthedocs.io/). The flow trains a model to predict a target variable based on 300 finance instruments.
185 |
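As a rough sketch of that pattern (assuming LightGBM is installed and the table has a `target` column; names are placeholders, not the example flow's actual code):

```python
import lightgbm as lgb

df = table.to_pandas()                       # Arrow -> Pandas, zero-copy where possible
features = df.drop(columns=["target"])
train_set = lgb.Dataset(features, label=df["target"])
model = lgb.train({"objective": "regression"}, train_set, num_boost_round=100)
```
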
186 | You can run the code easily by yourself and [visualize the results through a Metaflow card](https://docs.metaflow.org/metaflow/visualizing-results), [like this one](https://cards.outerbounds.dev/index.html?id=pw-1347686565%2F0e5a2797fce749c5):
187 |
188 |
189 |
190 | Summary
191 | -------
192 |
193 | This post outlined patterns that you can use to load tabular data from S3 to your Metaflow workflows quickly and process it in various ways using Arrow-compatible libraries.
194 |
195 | In future posts, we will dive deeper into related topics around data: how to use data engineering tools like `dbt` together with Metaflow, how to handle large datasets in a horizontally scalable manner using the patterns highlighted here, and how to make sure these patterns comply with your data governance policies.
196 |
197 | You can test patterns highlighted here in [the Metaflow sandbox](https://outerbounds.com/sandbox). If you have any
198 | feedback, questions, or other thoughts around data, [join us and thousands of other data scientists and engineers on Metaflow Slack](http://slack.outerbounds.co)!
199 |
200 | ---
201 |
202 | PS. If open-source data topics pique your interest, join [our upcoming event on The Open-Source Modern Data Stack on June 7th.](https://www.eventbrite.com/e/the-open-source-modern-data-stack-tickets-630584333717)
203 |
204 |
205 |
206 | 
207 |
208 |
--------------------------------------------------------------------------------
/data/test-data/ob/docs/data-science/compute/use_multiple_cpu_cores.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Use Multiple CPU Cores
3 | slug: /docs/use-multiple-cpu-cores
4 | tags: [versioning]
5 | sidebar_label: 🍃 Use Multiple Cores
6 | id: use-multiple-cpu-cores
7 | pagination_next: null
8 | pagination_prev: null
9 | description: How can I use multiple CPU cores inside of Metaflow tasks?
10 | category: data science
11 | hide_table_of_contents: false
12 | ---
13 |
14 |
15 |
16 |
17 | ## Question
18 | How can I use multiple CPU cores within Metaflow tasks?
19 |
20 | ## Answer
21 | You can use multiple cores in a Metaflow task just as you normally would in your Python code.
22 | For example, you can use Python's [concurrent.futures](https://docs.python.org/3/library/concurrent.futures.html) module to launch multiple processes and threads.
23 |
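For CPU-bound functions that would contend on the GIL, a process pool is the usual alternative to the thread pool used in the flow below; a minimal sketch:

```py
from concurrent import futures

def cpu_heavy(n):
    return sum(i * i for i in range(n))

if __name__ == "__main__":
    with futures.ProcessPoolExecutor(max_workers=4) as exe:
        # each call runs in a separate process, so it can use a separate core
        results = list(exe.map(cpu_heavy, [10_000_000] * 4))
    print(results)
```
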
24 |
25 |
26 | ### Write Flow with Concurrency in a Task
27 |
28 |
29 |
30 |
31 | This flow shows how to:
32 | - Call the `do_task` function four times in parallel using Python's built-in `concurrent.futures.ThreadPoolExecutor`.
33 | - Print the total time elapsed to demonstrate the tasks ran in parallel.
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 | ```py title="use_multiple_cpu_cores.py"
42 | from metaflow import FlowSpec, step, Parameter
43 | from concurrent import futures
44 | import time
45 |
46 | def do_task(name):
47 | time.sleep(3) # a long-running task
48 | return name
49 |
50 | class MulticoreFlow(FlowSpec):
51 |
52 | #highlight-next-line
53 | num_cores = Parameter('num-cores', default=4)
54 |
55 | @step
56 | def start(self):
57 | threads = [
58 | "thread_%s" % i
59 | for i in range(self.num_cores)
60 | ]
61 | t0 = time.time()
62 | #highlight-start
63 | with futures.ThreadPoolExecutor(
64 | max_workers = self.num_cores
65 | ) as exe:
66 | self.messages = [
67 | msg for msg in exe.map(do_task, threads)
68 | ]
69 | #highlight-end
70 | self.time_elapsed = time.time() - t0
71 | self.next(self.end)
72 |
73 | @step
74 | def end(self):
75 | print_msg = "All tasks completed in %.3fs"
76 | print(print_msg % self.time_elapsed)
77 | [print(msg) for msg in self.messages]
78 |
79 | if __name__ == '__main__':
80 | MulticoreFlow()
81 | ```
82 |
83 |
84 |
85 | ### Run Flow
86 |
87 |
88 |
89 | ```bash
90 | python use_multiple_cpu_cores.py run
91 | ```
92 |
93 |
94 |
95 | ```
96 | Workflow starting (run-id 1663278003721691):
97 | [1663278003721691/start/1 (pid 47937)] Task is starting.
98 | [1663278003721691/start/1 (pid 47937)] Task finished successfully.
99 | [1663278003721691/end/2 (pid 47941)] Task is starting.
100 | [1663278003721691/end/2 (pid 47941)] All tasks completed in 3.011s
101 | [1663278003721691/end/2 (pid 47941)] thread_0
102 | [1663278003721691/end/2 (pid 47941)] thread_1
103 | [1663278003721691/end/2 (pid 47941)] thread_2
104 | [1663278003721691/end/2 (pid 47941)] thread_3
105 | [1663278003721691/end/2 (pid 47941)] Task finished successfully.
106 | Done!
107 | ```
108 |
109 |
110 |
111 |
112 |
113 |
114 | ## Further Reading
115 | * [Parallelization Over Multiple Cores](https://docs.metaflow.org/metaflow/scaling-out-and-up/effortless-scaling-with-kubernetes#parallelization-over-multiple-cores)
116 | * [How to Scale Model Training and Tuning to GPU](/docs/scale-model-training-and-tuning/)
117 |
--------------------------------------------------------------------------------
/data/test-data/ob/docs/data-science/orchestration/dags_for_data_science.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Why Should I Care About DAGs and Workflows in Data Science?
3 | slug: /docs/dags-in-data-science
4 | tags: [dag, orchestration]
5 | sidebar_label: 🌳 Why DAGs?
6 | id: dags-in-data-science
7 | pagination_next: null
8 | pagination_prev: null
9 | description: Machine learning and data science workflows can become complicated quickly. To avoid a jungle of data pipelines and models, we need organizational principles for workflows, and directed acyclic graphs have become a standard for this
10 | category: data science
11 | hide_table_of_contents: false
12 | ---
13 |
14 |
15 |
16 |
17 | ## DAGs and Data Science Workflows
18 |
19 | Machine learning and data science workflows can become complicated very quickly. To avoid a jungle of data pipelines and models, we need organizational principles for our workflows, and directed acyclic graphs, aka DAGs, have become a standard for this:
20 |
21 | 
22 | Common data science DAG
23 |
24 | DAGs don’t only exist to tame complexity in the abstract, but are practically useful for several reasons:
25 |
26 | * They provide a common vocabulary of steps (the nodes) and transitions between them (the edges), which makes it easier to write and understand nontrivial applications that are structured as DAGs.
27 | * They both encourage and force us to be explicit about the order in which our workflows are executed. This is particularly useful when the order is anything more complicated than a linear flow, such as you would see in a notebook. Being explicit about the order of operations helps us manage what could otherwise be a jungle of models and data pipelines.
28 | * Using DAGs, we can signal when the order of operations doesn’t matter, such as when training independent models. We can parallelize these operations automatically, which is essential for performant code.
29 |
30 | In summary, it is helpful to view DAGs as a language, not a programming language per se, but rather a formal construct for human-to-human communication. With DAGs, we can speak about complex sequences of operations in a concise and understandable manner.
31 |
32 | ## What exactly is a DAG?
33 |
34 | It is worth defining explicitly what a DAG is. "Directed acyclic graph" may sound like a mouthful, but you can think of DAGs as follows:
35 |
36 | * A graph is a set of nodes connected by edges
37 | * A graph is directed if each edge is an arrow (that is, pointing from one node to another)
38 | * Acyclic means there are no cycles
39 |
40 | The nodes of the DAG of a data science or machine learning workflow are the steps in your workflow, such as (but not limited to) getting your data, training a model, or choosing the best model, as in the DAG above. In the case of Metaflow, a step can contain any Python code - like a cell in a notebook. The edges tell you which step(s) follow from other steps and this is why they need to be directed. The reason we don’t want cycles may now be clear: we don’t want our pipelines to end up in infinite loops!
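To make this concrete, here is a minimal, hypothetical Metaflow flow: its DAG branches into two independent training steps that can run in parallel, and a join step collects their results. The step names and artifacts are made up for illustration.

```py
# A hypothetical toy DAG: start -> (train_a, train_b) -> join -> end.
# The branch signals that train_a and train_b are independent and can run in parallel.
from metaflow import FlowSpec, step

class MiniDAGFlow(FlowSpec):

    @step
    def start(self):
        self.next(self.train_a, self.train_b)  # branch: order between the two doesn't matter

    @step
    def train_a(self):
        self.model = "model A"  # placeholder for real training code
        self.next(self.join)

    @step
    def train_b(self):
        self.model = "model B"  # placeholder for real training code
        self.next(self.join)

    @step
    def join(self, inputs):
        self.models = [inp.model for inp in inputs]  # merge results from both branches
        self.next(self.end)

    @step
    def end(self):
        print(self.models)

if __name__ == "__main__":
    MiniDAGFlow()
```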
41 |
42 | In an industrial setting, DAGs can be and often are far more complicated than the toy case above:
43 |
44 | 
45 | DAGs expand quickly in data science workflows
46 |
47 | To be explicit: DAGs are important because they tame the complexity of machine learning workflows. There is an important corollary: not all data science workflows are DAGs; rather, DAGs are a particular type of workflow.
48 |
49 |
50 | ## Orchestrating Workflows
51 |
52 | Once we have a DAG, in order to perform our computation, we require a workflow orchestrator. The workflow orchestrator needs to perform a seemingly simple task: given a workflow or DAG definition, execute the steps defined by the graph in order. The orchestrator walks through the graph, sending each step to a separate compute layer that is responsible for executing the code contained in a step. For data-intensive, distributed workflows, a compute layer like Spark or Kubernetes can be used, whereas local processes suffice for prototyping.
53 | In contrast to prototyping, production workflows need to run automatically and reliably without human intervention. Workflow orchestrators play key roles in ensuring that production workflows, such as those training models and producing predictions regularly, execute in a scalable and highly available manner. Hence it makes sense to use a battle-hardened, proven system in production such as:
54 | * [Argo](https://argoproj.github.io/), a modern orchestrator that runs natively on Kubernetes
55 | * [Airflow](https://airflow.apache.org/), a popular open-source workflow orchestrator
56 | * or a low-maintenance managed solution such as [Google Cloud Composer](https://cloud.google.com/composer) or [AWS Step Functions](https://aws.amazon.com/step-functions/?step-functions.sort-by=item.additionalFields.postDateTime&step-functions.sort-order=desc)
57 |
58 | ## Developing Workflows
59 | In the past, data science applications were often developed as independent modules that were glued together as a DAG afterwards, sometimes even by a separate team. As a result, it was hard to ensure that the whole application worked correctly end to end. Debugging workflow failures was often a painful experience, as issues couldn’t be easily reproduced and fixed locally.
60 | A better approach is to consider the whole workflow as a key deliverable of a data science project, which is developed, tested, and deployed as a functional unit. When using a local orchestrator like the one provided by Metaflow, you can rapidly prototype end-to-end workflows almost as easily as you develop code in a notebook. After the workflow works locally, it can be deployed to a production orchestrator without any changes to the code, as shown below. After all, a data science workflow is just a DAG of steps that can be orchestrated by many different systems.
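For example, with Metaflow this transition is a matter of CLI commands rather than code changes; the flow file name below is a placeholder, and which deployment target you use depends on your setup:

```bash
# prototype locally with Metaflow's built-in orchestrator
python myflow.py run

# deploy the same, unchanged flow to a production orchestrator,
# e.g. Argo Workflows or AWS Step Functions
python myflow.py argo-workflows create
python myflow.py step-functions create
```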
61 |
62 |
63 |
64 | ## How do I?
65 |
66 | [Pass artifacts between steps in Metaflow flows](/docs/pass-artifacts-between-steps)
67 |
68 | ---
69 |
70 | [Schedule Metaflow flows on AWS Step Functions](/docs/schedule-flow-on-step-functions)
71 |
72 | ---
73 |
74 |
75 |
76 |
--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
1 | name: rag-demo
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - python
6 | - pip
7 | - pandas
8 | - metaflow
9 | - matplotlib
10 | - seaborn
11 | - jupyter
12 | - pip:
13 |     - sentence-transformers
14 |     - openai
15 |     - lancedb
16 |     - python-frontmatter
17 |     - gitpython
18 |     - git+https://github.com/outerbounds/rag-demo
19 |     - python-slugify
20 |     - tldextract
21 |     - llama-index==0.8.0
22 |     - llama-cpp-python==0.1.77
23 |     - pinecone-client[grpc]
24 |     - streamlit
25 |     # - fashion-clip
--------------------------------------------------------------------------------
/flows/config/repo_params.py:
--------------------------------------------------------------------------------
1 | SAMPLE_OSS_MARKDOWN_REPOS = [
2 | {
3 | "deployment_url": "docs.metaflow.org",
4 | "repository_path": "https://github.com/Netflix/metaflow-docs",
5 | "repository_ref": "master",
6 | "base_search_path": "docs",
7 | "exclude_paths": ["docs/v"],
8 | "exclude_files": ["README.md", "README"],
9 | },
10 | {
11 | "deployment_url": "https://huggingface.co/docs/accelerate",
12 | "repository_path": "https://github.com/huggingface/accelerate",
13 | "repository_ref": "main",
14 | "base_search_path": "docs",
15 | "exclude_paths": [],
16 | "exclude_files": [],
17 | }
18 | ]
--------------------------------------------------------------------------------
/flows/data_table_processor.py:
--------------------------------------------------------------------------------
1 | from metaflow import FlowSpec, step, Parameter, card, current, Flow, trigger_on_finish
2 | from metaflow.cards import Image, Markdown
3 | from matplotlib import pyplot as plt
4 | import seaborn as sns
5 |
6 | sns.set_style("dark")
7 | COLORS = {
8 | "purple": "#98A1E1",
9 | "light-purple": "#DADEFB",
10 | "gold": "#F0C054"
11 | }
12 |
13 |
14 | @trigger_on_finish(flow='MarkdownChunker')
15 | class DataTableProcessor(FlowSpec):
16 |
17 | save_processed_df = Parameter(
18 | "save_processed_df",
19 | help="Whether to save the processed dataframe to the run.",
20 | default=True,
21 | type=bool,
22 | )
23 |
24 | data_dir = Parameter(
25 | "data_dir",
26 | help="The directory to save the processed dataframe to.",
27 | default="data",
28 | type=str,
29 | )
30 |
31 | parent_flow = Parameter(
32 | "parent_flow",
33 | help="The flow id of the parent flow to process.",
34 | default='MarkdownChunker',
35 | type=str,
36 | )
37 |
38 | n_bins = Parameter(
39 | "n_bins",
40 | help="The number of bins to use in the histogram.",
41 | default=100,
42 | type=int,
43 | )
44 |
45 | word_count_threshold = Parameter(
46 | "word_count_threshold",
47 |         help="The minimum word count for a row to be kept (also shown in the histogram labels).",
48 | default=10,
49 | type=int,
50 | )
51 |
52 | char_count_threshold = Parameter(
53 | "char_count_threshold",
54 | help="The char count threshold to use in processing.",
55 | default=25,
56 | type=int,
57 | )
58 |
59 | def plot_char_word_histogram(self, char_count_threshold=0, word_count_threshold=0, _df=None, title="", ):
60 | fig, ax = plt.subplots(1, 2, figsize=(12, 4))
61 | ax[0] = _df.char_count.plot.hist(bins=self.n_bins, color=COLORS['purple'], ax=ax[0])
62 | ax[1] = _df.word_count.plot.hist(bins=self.n_bins, color=COLORS['light-purple'], ax=ax[1])
63 | if char_count_threshold > 0 or word_count_threshold > 0:
64 | ax[0].set_xlabel("Filtered character count > %d" % char_count_threshold)
65 | ax[1].set_xlabel("Filtered word count > %d" % word_count_threshold)
66 | ax[0].set_xscale("log")
67 | ax[1].set_xscale("log")
68 | ax[0].set_ylabel("Frequency")
69 | ax[0].spines.right.set_visible(False)
70 | ax[0].spines.top.set_visible(False)
71 | ax[1].spines.right.set_visible(False)
72 | ax[1].spines.top.set_visible(False)
73 | fig.suptitle(title, fontsize=24)
74 | fig.tight_layout()
75 | assert fig is not None, "Figure is None, check plot_char_word_histogram."
76 | return fig
77 |
78 | def plot_tld_count(self):
79 | fig, ax = plt.subplots(1, 1, figsize=(12, 4))
80 | self.processed_df.groupby('tld').count()['index'].sort_values(ascending=False).plot.bar(
81 | ax=ax, color=COLORS['gold']
82 | )
83 | fig.suptitle("Top-level domain representation in the dataset", fontsize=24)
84 | ax.set_xticklabels(ax.get_xticklabels(), rotation = 45)
85 | ax.set_xlabel("")
86 | ax.set_ylabel("Frequency")
87 | fig.tight_layout()
88 | assert fig is not None, "Figure is None, check plot_tld_count."
89 | return fig
90 |
91 | @card
92 | @step
93 | def start(self):
94 |
95 | parent_flow = Flow(self.parent_flow)
96 | run = parent_flow.latest_run
97 |
98 | if not run.successful:
99 | print("Skipping processing of unsuccessful run {}.".format(run.id))
100 | self.processed_df = None
101 |
102 | else:
103 |
104 | current.card.append(Markdown(f"""# Processing data table from run {run.id}"""))
105 |
106 | current.card.append(Markdown(f"""## Filtering rows"""))
107 | df = run.data.df
108 | fig = self.plot_char_word_histogram(_df = df, title="Before filtering")
109 | current.card.append(Image.from_matplotlib(fig))
110 |
111 | # Filter out rows with less than N words.
112 | _df = df[df.word_count > self.word_count_threshold]
113 |
114 | # Filter out rows with less than M chars.
115 | _df = _df[_df.char_count > self.char_count_threshold]
116 |
117 | # Feature: Add a column for the TLD.
118 | import tldextract
119 | def get_tld(url):
120 | ext = tldextract.extract(url)
121 | if ext.subdomain != '':
122 | url = '.'.join(ext[:3])
123 | else:
124 | url = '.'.join(ext[1:3])
125 | return "https://" + url
126 | _df['tld'] = _df['page_url'].apply(get_tld)
127 |
128 | # Reindex and keep index in upstream dataframe.
129 | _df.reset_index(inplace=True)
130 | _df.index = range(len(_df))
131 |
132 | fig = self.plot_char_word_histogram(
133 | word_count_threshold=self.word_count_threshold,
134 | char_count_threshold=self.char_count_threshold,
135 | _df = _df, title="After filtering"
136 | )
137 | current.card.append(Image.from_matplotlib(fig))
138 |
139 | ### ADD MORE FILTERS HERE.
140 | print("Filtered dataframe from shape {} to shape {}.".format(
141 | df.shape, _df.shape))
142 | self.processed_df = _df
143 |
144 | # Plot the number of rows per TLD.
145 | fig = self.plot_tld_count()
146 | current.card.append(Image.from_matplotlib(fig))
147 |
148 | ### ADD MORE SUMMARY STATS HERE.
149 |
150 | self.next(self.end)
151 |
152 | @step
153 | def end(self):
154 |
155 | import os
156 |
157 | print("The {} run {} has ended, with a dataframe of shape: {}".format(
158 | current.flow_name, current.run_id,
159 | self.processed_df.shape))
160 | print(
161 | f"""
162 | You can now use the dataframe to do whatever you want.
163 | To load it in a notebook, you can use the following code:
164 |
165 |             from metaflow import Run, namespace
166 | namespace('{current.namespace}')
167 | run = Run('{current.flow_name}/{current.run_id}')
168 | processed_df = run.data.processed_df
169 | print(processed_df.shape)
170 | """
171 | )
172 |
173 | if self.save_processed_df:
174 | self.processed_df_path = os.path.abspath(
175 | os.path.join(self.data_dir, 'processed_df_%s.csv' % current.run_id))
176 | print("Saving processed dataframe to %s" % self.processed_df_path)
177 | self.processed_df.to_csv(self.processed_df_path, index=False)
178 |
179 |
180 | if __name__ == '__main__':
181 | DataTableProcessor()
--------------------------------------------------------------------------------
/flows/lancedb_index.py:
--------------------------------------------------------------------------------
1 | from metaflow import FlowSpec, step, Flow, current
2 |
3 | class LanceDBVectorIndexer(FlowSpec):
4 |
5 | table_name = "test"
6 | embedding_model = "paraphrase-MiniLM-L6-v2"
7 | embedding_target_col_name = "contents"
8 |
9 | def find_processed_df(self):
10 | for run in Flow('DataTableProcessor'):
11 | if run.data.save_processed_df:
12 | print("Found processed df in run: {}".format(run.id))
13 | return run.data.processed_df
14 |
15 | @step
16 | def start(self):
17 | self.next(self.create_index)
18 |
19 | @step
20 | def create_index(self):
21 |
22 | from rag_tools.databases.vector_database import LanceDB
23 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder
24 | import pandas as pd
25 |
26 | # fetch data and embed it
27 | self.data = self.find_processed_df()
28 | encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu")
29 | docs = self.data[self.embedding_target_col_name].tolist()
30 | self.ids = list(range(1, len(docs) + 1))
31 | embeddings = encoder.embed(docs)
32 | self.dimension = len(embeddings[0])
33 |
34 | # put the vectors in the index
35 | db = LanceDB()
36 | db.create_index(self.table_name, embeddings, docs, self.ids)
37 |
38 | self.next(self.end)
39 |
40 | @step
41 | def end(self):
42 |
43 | from rag_tools.databases.vector_database import LanceDB
44 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder
45 |
46 | db = LanceDB()
47 |
48 | # search the index in a test query
49 | K = 3
50 | test_prompt = "aws"
51 | encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu")
52 | self.search_vector = encoder.embed([test_prompt])[0]
53 | self.results = db.vector_search(self.table_name, self.search_vector, k=K)
54 |
55 | print(f"""
56 | Access flow results with:
57 |
58 | from metaflow import Run
59 | run = Run('{current.flow_name}/{current.run_id}')
60 | results = run.data.results
61 |
62 |         Query the LanceDB index built by LanceDBVectorIndexer with:
63 |
64 | from rag_tools.databases.vector_database import LanceDB
65 | db = LanceDB() # default storage location is `../../chatbot.lance`, relative to your cwd.
66 | db.vector_search(table_name, search_vector, k=K)
67 | """)
68 |
69 |
70 | if __name__ == '__main__':
71 | LanceDBVectorIndexer()
--------------------------------------------------------------------------------
/flows/markdown_chunker.py:
--------------------------------------------------------------------------------
1 | from metaflow import FlowSpec, step, current, schedule
2 | from rag_tools.filetypes.markdown import Mixin as MarkdownMixin
3 |
4 | @schedule(weekly=True)
5 | class MarkdownChunker(FlowSpec, MarkdownMixin):
6 |
7 | @step
8 | def start(self):
9 | """
10 | Start the flow.
11 | Try to download the content from the repository.
12 | """
13 |
14 |         # see config/repo_params.py for the definition of repo_params;
15 |         # it is a list of dictionaries
16 |         # that tell the Markdown tools where to look for content.
17 | # see /notebooks/markdonw_repo_explorer.ipynb for more details.
18 | from config.repo_params import SAMPLE_OSS_MARKDOWN_REPOS
19 |
20 | self.repo_params = SAMPLE_OSS_MARKDOWN_REPOS
21 | self.df = self.load_df_from_repo_list()
22 | self.next(self.end)
23 |
24 | @step
25 | def end(self):
26 | print("The flow has ended, with a dataframe of shape: {}".format(self.df.shape))
27 | print(
28 | f"""
29 | You can now use the dataframe to do whatever you want.
30 | To load it in a notebook, you can use the following code:
31 |
32 |         from metaflow import Run, namespace
33 | namespace('{current.namespace}')
34 | run = Run('{current.flow_name}/{current.run_id}')
35 | df = run.data.df
36 | print(df.shape)
37 | """)
38 |
39 |
40 | if __name__ == "__main__":
41 | MarkdownChunker()
--------------------------------------------------------------------------------
/flows/pinecone_index.py:
--------------------------------------------------------------------------------
1 | from metaflow import FlowSpec, step, Flow, environment
2 | import os
3 |
4 | env_vars = {
5 | 'PINECONE_API_KEY': os.environ['PINECONE_API_KEY'],
6 | 'GCP_ENVIRONMENT': os.environ['GCP_ENVIRONMENT']
7 | }
8 |
9 | class PineconeVectorIndexer(FlowSpec):
10 |
11 | index_name = "test"
12 | embedding_model = "paraphrase-MiniLM-L6-v2"
13 | embedding_target_col_name = "contents"
14 |
15 | def find_processed_df(self):
16 | for run in Flow('DataTableProcessor'):
17 | if run.data.save_processed_df:
18 | print("Found processed df in run: {}".format(run.id))
19 | return run.data.processed_df
20 |
21 | @step
22 | def start(self):
23 | self.next(self.create_index)
24 |
25 | @environment(vars=env_vars)
26 | @step
27 | def create_index(self):
28 |
29 | from rag_tools.databases.vector_database import PineconeDB
30 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder
31 | import pandas as pd
32 |
33 | # fetch data and embed it
34 | self.data = self.find_processed_df()
35 | encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu")
36 | docs = self.data[self.embedding_target_col_name].tolist()
37 | self.ids = list(range(1, len(docs) + 1))
38 | embeddings = encoder.embed(docs)
39 | self.dimension = len(embeddings[0])
40 |
41 | # create the index
42 | db = PineconeDB()
43 | db.create_index(self.index_name, dimension=self.dimension)
44 |
45 | # put the vectors in the index
46 | db.upsert(self.index_name, embeddings, docs, self.ids)
47 |
48 | self.next(self.end)
49 |
50 | @environment(vars=env_vars)
51 | @step
52 | def end(self):
53 |
54 | from rag_tools.databases.vector_database import PineconeDB
55 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder
56 |
57 | # create_index is idempotent
58 | db = PineconeDB()
59 | db.create_index(self.index_name, dimension=self.dimension)
60 |
61 | # search the index in a test query
62 | K = 3
63 | test_prompt = "aws"
64 | encoder = SentenceTransformerEmbedder(self.embedding_model, device="cpu")
65 | self.search_vector = encoder.embed([test_prompt])[0]
66 | self.results = db.vector_search(self.index_name, self.search_vector, k=K).to_dict()
67 |
68 | for result in self.results['matches']:
69 | print("\n\nid: {} - score: {} \n\n{}\n\n".format(result['id'], result['score'], result['metadata']['text']))
70 | print("===============================================")
71 |
72 | print("\n\n Flow is done, check for results in the {} index at https://app.pinecone.io/.".format(self.index_name))
73 |
74 |
75 | if __name__ == '__main__':
76 | PineconeVectorIndexer()
--------------------------------------------------------------------------------
/flows/test_simple_embedding.py:
--------------------------------------------------------------------------------
1 | from rag_tools.databases.vector_database import LanceDB, PineconeDB
2 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder
3 | from metaflow import Flow
4 | import pandas as pd
5 |
6 | def find_processed_df():
7 | for run in Flow('DataTableProcessor'):
8 | if run.data.save_processed_df:
9 | print("Found processed df in run: {}".format(run.id))
10 | return run.data.processed_df
11 |
12 | # fetch data and embed it
13 | data = find_processed_df()
14 | encoder = SentenceTransformerEmbedder("paraphrase-MiniLM-L6-v2", device="cpu")
15 | docs = data["contents"].tolist()
16 | ids = list(range(1, len(docs) + 1))
17 | embeddings = encoder.embed(docs)
18 |
19 | DB = "lance"
20 |
21 | if DB == "lance":
22 | db = LanceDB()
23 | db.create_index("test", embeddings, docs, ids)
24 |
25 | elif DB == "pinecone":
26 | db = PineconeDB()
27 | db.create_index("test", dimension=len(embeddings[0]))
28 | db.upsert("test", embeddings, docs, ids)
29 |
30 | # search the index
31 | K = 3
32 | user_prompt = "aws"
33 | search_vector = encoder.embed([user_prompt])[0]
34 | results = db.vector_search("test", search_vector, k=K) #.to_dict()
35 | # note: the loop below assumes the Pinecone-style result format; LanceDB returns results in a different shape
36 | for result in results['matches']:
37 | print("id: {} - distance score: {} \n\n{}\n\n".format(result['id'], result['score'], result['metadata']['text']))
38 |
39 | db.destroy_index("test")
--------------------------------------------------------------------------------
/flows/test_simple_query.py:
--------------------------------------------------------------------------------
1 | import os
2 | from rag_tools.databases.vector_database import LanceDB
3 | from rag_tools.llms.llms_search import ChatGPTWrapper
4 | from rag_tools.embedders.embedder import SentenceTransformerEmbedder
5 |
6 | # import lancedb
7 | # URI = "data/sample-lancedb"
8 | # db = lancedb.connect(URI)
9 |
10 | query = "How do I specify conda dependencies in my flow?"
11 |
12 | # embed the query with a sentence transformer
13 | encoder = SentenceTransformerEmbedder("paraphrase-MiniLM-L6-v2", device="cpu")
14 | search_vector = encoder.embed([query])[0]
15 | db = LanceDB()
16 | # search the LanceDB vector index ("test" is the table created by flows/lancedb_index.py)
17 | best_text = db.vector_search("test", search_vector, k=2)
18 |
19 | # we build some context for the question
20 | text = "\n\n".join(best_text['text'].tolist())
21 |
22 | # query prompt for chatgpt
23 | prompt = f"Please answer this question {query}\n\nhere's the context you should use:\n\n{text}.\n\nIf the answer is not provided in the context, answer I don't know."
24 |
25 | output = ChatGPTWrapper(os.environ['OPENAI_API_KEY']).sample(prompt)
26 |
27 | print(f"Question: {prompt}")
28 | print()
29 | print(f"Answer: {output}")
--------------------------------------------------------------------------------
/images/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/images/cover.png
--------------------------------------------------------------------------------
/notebooks/llama_index_exploration.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Set your API Key"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# %env OPENAI_API_KEY="
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "# Import dependencies"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 61,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# built-ins\n",
33 | "from pathlib import Path\n",
34 | "from functools import partial\n",
35 | "import os\n",
36 | "from typing import Tuple, List\n",
37 | "from dataclasses import dataclass\n",
38 | "\n",
39 | "# custom lib\n",
40 | "from rag_tools.repo.ops import ensure_repo_exists_locally, DocumentationExtractor\n",
41 | "\n",
42 | "# third party libs\n",
43 | "from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, Document, download_loader\n",
44 | "from llama_index.retrievers import VectorIndexRetriever\n",
45 | "from llama_index.response_synthesizers import get_response_synthesizer\n",
46 | "from llama_index.llms import OpenAI\n",
47 | "from metaflow import Flow\n",
48 | "import pandas as pd\n",
49 | "from IPython.display import display, Markdown"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "# Define utility functions"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 11,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "@dataclass\n",
66 | "class Context:\n",
67 | " def __init__(self, response: str, source_node_ids: List[str]):\n",
68 | " self.response = response\n",
69 | " self.source_node_ids = source_node_ids\n",
70 | "\n",
71 | " def get_link_df(self, meta_df, link_col = 'doc_id'):\n",
72 | " return meta_df[meta_df[link_col].isin(self.source_node_ids)]\n",
73 | "\n",
74 | "def qa_iter(\n",
75 | " question: str, \n",
76 | " index: VectorStoreIndex, \n",
77 | " k:int = 2, \n",
78 | " response_mode:str = 'tree_summarize'\n",
79 | ") -> Context:\n",
80 | " \"Match a question against an index and returns the response.\"\n",
81 | " retriever = VectorIndexRetriever(index=index, similarity_top_k=k)\n",
82 | " response_synthesizer = get_response_synthesizer(response_mode=response_mode)\n",
83 | " query_engine = index.as_query_engine(response_synthesizer=response_synthesizer, retriever=retriever)\n",
84 | " query_res = query_engine.query(question)\n",
85 | " return Context(\n",
86 | " response=query_res.response, source_node_ids=list(query_res.metadata.keys())\n",
87 | " )\n",
88 | "\n",
89 | "dm = lambda x: display(Markdown(x))\n",
90 | "def dmqa(q, a): \n",
91 | " dm(f\"\"\"\n",
92 | "**Question:** {q}\n",
93 | "\n",
94 | "**Answer:** {a}\n",
95 | "\"\"\"\n",
96 | ")\n",
97 | "\n",
98 | "def nb_output_format(question, response, similar_chunk_df):\n",
99 | " dm(f\"#### {question}\")\n",
100 | " dm(f\"**Retrieved Response**\")\n",
101 | " dm(response)\n",
102 | " dm(f\"#### Sources\")\n",
103 | " for _, chunk in similar_chunk_df.iterrows():\n",
104 | " dm(f\"##### [{chunk.header}]({chunk.page_url})\")\n",
105 | " dm(f\"{chunk.contents[:100]}...\")\n",
106 | "\n",
107 | "def get_documents_from_content_section_df(df):\n",
108 | " ids = []; documents = []\n",
109 | " for i, text in enumerate(df.contents):\n",
110 | " doc = Document(text=text, id_=i)\n",
111 | " documents.append(doc)\n",
112 | " ids.append(doc.id_)\n",
113 | " return documents, ids\n",
114 | "\n",
115 | "def generative_search_engine_iter(question, index, meta_df, meta_df_id_col='doc_id'):\n",
116 | " \"Assumes index and df are defined in the global scope\"\n",
117 | " context = qa_iter(question, index)\n",
118 | " similar_chunk_df = meta_df[meta_df[meta_df_id_col].isin(context.source_node_ids)]\n",
119 | " nb_output_format(question, context.response, similar_chunk_df)\n",
120 | "\n",
121 | "def get_documents_from_md_file_paths(fps):\n",
122 | " MarkdownReader = download_loader(\"MarkdownReader\")\n",
123 | " loader = MarkdownReader()\n",
124 | " documents = []\n",
125 | " for fp in fps:\n",
126 | " documents += loader.load_data(file=Path(fp))\n",
127 | " return documents"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "# 🛑 Set variables based on your machine's setup"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 4,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "# these are part of this repo, so you don't need to change\n",
144 | "DATA_DIR='../data'\n",
145 | "LLAMA_INDEX_TUTORIAL_DATA = os.path.join(DATA_DIR, 'llama-index-tutorial')\n",
146 | "\n",
147 | "# this is unique to your machine. where did you clone https://github.com/Netflix/metaflow-docs to?\n",
148 | "YOUR_LOCAL_METAFLOW_DOCS_REPO_PATH = os.path.expanduser(\"~/Dev/metaflow-docs\")"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "# Which GitHub repos?"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 5,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "REPO_PARAMS = [\n",
165 | " {\n",
166 | " \"deployment_url\": \"docs.metaflow.org\",\n",
167 | " \"repository_path\": \"https://github.com/Netflix/metaflow-docs\",\n",
168 | " \"repository_ref\": \"master\",\n",
169 | " \"base_search_path\": \"docs\",\n",
170 | " \"exclude_paths\": [\"docs/v\"],\n",
171 | " \"exclude_files\": [\"README.md\", \"README\"],\n",
172 | " }\n",
173 | "]"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "# 1. Llama Index Warmup"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 63,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "# This cell will trigger llama_index to look for OPENAI_API_KEY in environment variables,\n",
190 | "# then default to downloading a llama 2 model binary locally.\n",
191 | "\n",
192 | "# this thing is looking for a .txt file in the data dir.\n",
193 | "documents = SimpleDirectoryReader(LLAMA_INDEX_TUTORIAL_DATA).load_data()\n",
194 | "\n",
195 | "# Indexing is the first, and most crucial, stage in a RAG workflow.\n",
196 | "# It is the process of converting a set of documents into a vector representation.\n",
197 | "# This vector representation is later used to retrieve relevant documents for a given query.\n",
198 | "service_context = ServiceContext.from_defaults(\n",
199 | " llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.0)\n",
200 | ")\n",
201 | "index = VectorStoreIndex.from_documents(documents, service_context=service_context) "
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 64,
207 | "metadata": {},
208 | "outputs": [
209 | {
210 | "data": {
211 | "text/markdown": [
212 | "The author worked on writing and programming outside of school before college. They wrote short stories and tried writing programs on an IBM 1401 computer. They also built a microcomputer kit and started programming on it, writing simple games and a word processor."
213 | ],
214 | "text/plain": [
215 | ""
216 | ]
217 | },
218 | "metadata": {},
219 | "output_type": "display_data"
220 | }
221 | ],
222 | "source": [
223 | "# Query engine \"takes in a natural language query, and returns a response, along with reference context retrieved and passed to the LLM.\"\n",
224 | "query_engine = index.as_query_engine()\n",
225 | "response = query_engine.query(\"What did the author do growing up?\")\n",
226 | "dm(response.response)\n",
227 | "\n",
228 | "# Taking too many minutes to run with Llama 2 on my macbook :( \n",
229 | "# Conservatively estimate OpenAI API is ~$1 per dozen end-to-end runs of this notebook."
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "## Try a [Llama Hub tool for parsing `.md` files](https://llamahub.ai/l/file-markdown)\n",
237 | "\n",
238 | "This could be used in conjunction or in place of the custom markdown parser used in `./markdown_chunker.py`"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 40,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "from pathlib import Path\n",
248 | "from llama_index import download_loader\n",
249 | "import os"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 41,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "MarkdownReader = download_loader(\"MarkdownReader\")\n",
259 | "loader = MarkdownReader()\n",
260 | "\n",
261 | "# start with a single document\n",
262 | "test_path = os.path.abspath(\"%s/test-data/ob/blog/metaflow-fast-data.md\" % DATA_DIR)\n",
263 | "documents = loader.load_data(file=Path(test_path))"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 65,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "index = VectorStoreIndex.from_documents(documents, service_context=service_context)"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "## Ask some questions over the index"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 66,
285 | "metadata": {},
286 | "outputs": [
287 | {
288 | "data": {
289 | "text/markdown": [
290 | "\n",
291 | "**Question:** What is the fastest way to load data onto AWS Batch instances using Metaflow?\n",
292 | "\n",
293 | "**Answer:** The fastest way to load data onto AWS Batch instances using Metaflow would be to utilize the data loading capabilities provided by Metaflow itself. Metaflow offers built-in functionality for handling data loading and processing, allowing you to efficiently transfer and process data on AWS Batch instances. By leveraging Metaflow's data loading features, you can optimize the loading process and ensure efficient utilization of AWS Batch resources.\n"
294 | ],
295 | "text/plain": [
296 | ""
297 | ]
298 | },
299 | "metadata": {},
300 | "output_type": "display_data"
301 | }
302 | ],
303 | "source": [
304 | "question = (\n",
305 | " \"What is the fastest way to load data onto AWS Batch instances using Metaflow?\"\n",
306 | ")\n",
307 | "\n",
308 | "query_engine = index.as_query_engine()\n",
309 | "response = query_engine.query(question).response\n",
310 | "\n",
311 | "dmqa(question, response)"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 68,
317 | "metadata": {},
318 | "outputs": [
319 | {
320 | "data": {
321 | "text/markdown": [
322 | "\n",
323 | "**Question:** How does Metaflow use `tmpfs`?\n",
324 | "\n",
325 | "**Answer:** Metaflow uses `tmpfs` to store temporary data during the execution of workflows. `tmpfs` is a temporary file system that resides in memory, which means that the data stored in `tmpfs` is not persisted across system reboots. This makes it ideal for storing temporary data that is only needed during the execution of a workflow and can be discarded afterwards. By using `tmpfs`, Metaflow can achieve faster read and write operations compared to using disk-based storage.\n"
326 | ],
327 | "text/plain": [
328 | ""
329 | ]
330 | },
331 | "metadata": {},
332 | "output_type": "display_data"
333 | }
334 | ],
335 | "source": [
336 | "question = \"How does Metaflow use `tmpfs`?\"\n",
337 | "response = query_engine.query(question).response\n",
338 | "dmqa(question, response)"
339 | ]
340 | },
341 | {
342 | "cell_type": "markdown",
343 | "metadata": {},
344 | "source": [
345 | "# 2. Controlling hallucinations by curating an index\n",
346 | "Here are a few questions we will explore in this section:\n",
347 | "\n",
348 | "**What is the problem with the above workflow?**\n",
349 | "\n",
350 | "In the previous section's index, we created all the vectors from chunks of a [post](https://outerbounds.com/blog/metaflow-fast-data/) specifically about the `tmpfs` feature. [Metaflow docs](https://docs.metaflow.org/) don't contain that much content about `tmpfs` yet.\n",
351 | "\n",
352 | "If we use the Metaflow docs as the source objects to populate the index, and a question is asked to the model about `tmpfs`, how can we know if it is hallucinating it, or referencing an actual piece of content that Metaflow maintainers endorse?\n",
353 | "> Bing Chat AI:\n",
354 | "Give a bunch of links to the content that LLM response was conditioned on.\n",
355 | "\n",
356 | "We will build a simple system like this in the next section. First, let's see the power of understanding the domain of our index, and then move to using it as a way to reference source material in the generated response."
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "## Index Metaflow docs"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 69,
369 | "metadata": {},
370 | "outputs": [],
371 | "source": [
372 | "file_paths = DocumentationExtractor().filter_files(\n",
373 | " YOUR_LOCAL_METAFLOW_DOCS_REPO_PATH,\n",
374 | " base_search_path = \"docs\",\n",
375 | " exclude_paths = [\"docs/v\"],\n",
376 | " exclude_files = [\"README.md\", \"README\"],\n",
377 | " considered_extensions = [\".md\"],\n",
378 | ")"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 70,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "file_paths = [abs_path for abs_path, _ in file_paths]"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 71,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "documents = get_documents_from_md_file_paths(file_paths)"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": 72,
402 | "metadata": {},
403 | "outputs": [
404 | {
405 | "name": "stdout",
406 | "output_type": "stream",
407 | "text": [
408 | "Showing sample of 1 out of 500 objects\n"
409 | ]
410 | },
411 | {
412 | "data": {
413 | "text/plain": [
414 | "[Document(id_='b185a5f6-76a2-4095-9523-adb59d3518c9', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='168707b49cebf5424ed48615defbba683a0d7932f8451d263d491040c37aa87f', text='\\n\\nWelcome to Metaflow\\n\\nMetaflow makes it easy to build and manage real-life data science and machine learning projects.\\n\\n\\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]"
415 | ]
416 | },
417 | "execution_count": 72,
418 | "metadata": {},
419 | "output_type": "execute_result"
420 | }
421 | ],
422 | "source": [
423 | "N = 1\n",
424 | "print(\n",
425 | " \"Showing sample of {m} out of {n} {t} objects\".format(\n",
426 | " m=N, n=len(documents), t=type(documents[0])\n",
427 | " )\n",
428 | ")\n",
429 | "documents[:N]"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": 73,
435 | "metadata": {},
436 | "outputs": [
437 | {
438 | "name": "stdout",
439 | "output_type": "stream",
440 | "text": [
441 | "CPU times: user 582 ms, sys: 187 ms, total: 770 ms\n",
442 | "Wall time: 14.1 s\n"
443 | ]
444 | }
445 | ],
446 | "source": [
447 | "%%time\n",
448 | "index = VectorStoreIndex.from_documents(documents)\n",
449 | "# TODO: Measure times as this thing scales with N documents and larger sized documents."
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "## Q&A iterations over the Metaflow docs index"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 74,
462 | "metadata": {},
463 | "outputs": [
464 | {
465 | "data": {
466 | "text/markdown": [
467 | "\n",
468 | "**Question:** What is Metaflow?\n",
469 | "\n",
470 | "**Answer:** Metaflow is a Python library that simplifies the development, deployment, and operation of data-intensive applications, specifically those related to data science and machine learning. It was initially created at Netflix to enhance the efficiency of data scientists working on a range of projects, from traditional statistics to cutting-edge deep learning. Metaflow is an open-source tool released under the Apache License, Version 2.0.\n"
471 | ],
472 | "text/plain": [
473 | ""
474 | ]
475 | },
476 | "metadata": {},
477 | "output_type": "display_data"
478 | }
479 | ],
480 | "source": [
481 | "question = \"What is Metaflow?\"\n",
482 | "context = qa_iter(question, index)\n",
483 | "dmqa(question, context.response)"
484 | ]
485 | },
486 | {
487 | "cell_type": "code",
488 | "execution_count": 75,
489 | "metadata": {},
490 | "outputs": [
491 | {
492 | "data": {
493 | "text/markdown": [
494 | "\n",
495 | "**Question:** How do I specify conda dependencies in my flow?\n",
496 | "\n",
497 | "**Answer:** You can specify conda dependencies in your flow using the `@conda_base` and `@conda` decorators. The `@conda_base` decorator is used at the flow level to specify explicit library dependencies, python version, and whether to exclude all steps from executing within a conda environment. The `@conda` decorator is used at the step level to update the explicit library dependencies, python version, and conda environment exclusion as specified by the `@conda_base` decorator. By using these decorators, you can define the conda environment for each step in your flow. Additionally, you can add an explicit dependency on a specific module by using the `@conda` decorator in the corresponding step.\n"
498 | ],
499 | "text/plain": [
500 | ""
501 | ]
502 | },
503 | "metadata": {},
504 | "output_type": "display_data"
505 | }
506 | ],
507 | "source": [
508 | "question = \"How do I specify conda dependencies in my flow?\"\n",
509 | "context = qa_iter(question, index)\n",
510 | "dmqa(question, context.response)"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {},
516 | "source": [
517 | "### Do Metaflow docs know about `tmpfs` though?"
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": 76,
523 | "metadata": {},
524 | "outputs": [
525 | {
526 | "data": {
527 | "text/markdown": [
528 | "\n",
529 | "**Question:** How does Metaflow use `tmpfs`?\n",
530 | "\n",
531 | "**Answer:** Metaflow does not use `tmpfs` based on the given information.\n"
532 | ],
533 | "text/plain": [
534 | ""
535 | ]
536 | },
537 | "metadata": {},
538 | "output_type": "display_data"
539 | }
540 | ],
541 | "source": [
542 | "question = \"How does Metaflow use `tmpfs`?\"\n",
543 | "dmqa(question, qa_iter(question, index).response)"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "### Adding specific knowledge to the index\n",
551 | "\n",
552 |     "As of August, 2023, the Metaflow documentation does not have much writing about `tmpfs`, so this makes sense.\n",
553 | "\n",
554 | "How can we add [Outerbounds blog post](https://outerbounds.com/blog/metaflow-fast-data/) that announced the `tmpfs` and Metaflow integration to the index, to give the model the context it needs to answer this question?\n",
555 | "\n",
556 | "Let's create an index that combines the one we saw earlier for the `tmpfs` blog post with the one we just created for Metaflow docs."
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 78,
562 | "metadata": {},
563 | "outputs": [],
564 | "source": [
565 | "fast_data_file_path = os.path.abspath('%s/test-data/ob/blog/metaflow-fast-data.md' % DATA_DIR)\n",
566 | "\n",
567 | "# combining the document set\n",
568 | "fast_data_doc = get_documents_from_md_file_paths([fast_data_file_path])\n",
569 | "index_fast_data_post = VectorStoreIndex.from_documents(fast_data_doc)"
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": 79,
575 | "metadata": {},
576 | "outputs": [
577 | {
578 | "data": {
579 | "text/markdown": [
580 | "\n",
581 | "**Question:** How does Metaflow use `tmpfs`?\n",
582 | "\n",
583 | "**Answer:** Metaflow recently implemented support for memory-based `tmpfs` filesystem on AWS Batch and Kubernetes. This feature allows users to create an ephemeral filesystem backed by memory on the fly, without making any changes to the infrastructure. By enabling this feature using the `@batch(use_tmpfs=True)` decorator for AWS Batch workloads or `@kubernetes(use_tmpfs=True)` decorator for Kubernetes, the `metaflow.S3` client is automatically aware of the `tmpfs` volume and will use it to speed up downloads. This helps improve the performance of data downloads from S3 in Metaflow workflows.\n"
584 | ],
585 | "text/plain": [
586 | ""
587 | ]
588 | },
589 | "metadata": {},
590 | "output_type": "display_data"
591 | }
592 | ],
593 | "source": [
594 | "# same question as above. now we can answer it with the new index.\n",
595 | "question = \"How does Metaflow use `tmpfs`?\"\n",
596 | "dmqa(question, qa_iter(question, index_fast_data_post).response)"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": 81,
602 | "metadata": {},
603 | "outputs": [],
604 | "source": [
605 | "# indexes are updateable/composable! \n",
606 | "for doc_chunk in fast_data_doc:\n",
607 | " index.insert(doc_chunk)"
608 | ]
609 | },
610 | {
611 | "cell_type": "code",
612 | "execution_count": 82,
613 | "metadata": {},
614 | "outputs": [
615 | {
616 | "data": {
617 | "text/markdown": [
618 | "\n",
619 | "**Question:** How does Metaflow use `tmpfs`?\n",
620 | "\n",
621 | "**Answer:** Metaflow uses `tmpfs` by implementing support for memory-based `tmpfs` filesystem on Batch and Kubernetes. This allows users to create an ephemeral filesystem backed by memory on the fly, without having to make any changes on the infrastructure side. When the `tmpfs` feature is enabled, the `metaflow.S3` client automatically uses it to speed up downloads. To enable this feature, users can add `@batch(use_tmpfs=True)` for AWS Batch workloads or `@kubernetes(use_tmpfs=True)` for Kubernetes in their Metaflow code.\n"
622 | ],
623 | "text/plain": [
624 | ""
625 | ]
626 | },
627 | "metadata": {},
628 | "output_type": "display_data"
629 | },
630 | {
631 | "data": {
632 | "text/markdown": [
633 | "\n",
634 | "**Question:** How do I specify conda dependencies in my flow?\n",
635 | "\n",
636 | "**Answer:** You can specify conda dependencies in your flow using the `@conda_base` and `@conda` decorators. The `@conda_base` decorator is used at the flow level to specify explicit library dependencies, python version, and whether to exclude all steps from executing within a conda environment. The `@conda` decorator is used at the step level to update the explicit library dependencies, python version, and conda environment exclusion as specified by the `@conda_base` decorator. By using these decorators, you can define the conda environment for each step in your flow. Additionally, you can add an explicit dependency on a specific module by using the `@conda` decorator in the desired step.\n"
637 | ],
638 | "text/plain": [
639 | ""
640 | ]
641 | },
642 | "metadata": {},
643 | "output_type": "display_data"
644 | }
645 | ],
646 | "source": [
647 | "# same questions as above. now we can answer it with the new index.\n",
648 | "question = \"How does Metaflow use `tmpfs`?\"\n",
649 | "dmqa(question, qa_iter(question, index).response)\n",
650 | "\n",
651 | "# and this one too.\n",
652 | "question = \"How do I specify conda dependencies in my flow?\"\n",
653 | "dmqa(question, qa_iter(question, index).response)"
654 | ]
655 | },
656 | {
657 | "cell_type": "markdown",
658 | "metadata": {},
659 | "source": [
660 | "# 3. Constructing an index based on sections of endorsed content we can link to\n",
661 | "[Parse the Documents into Nodes](https://gpt-index.readthedocs.io/en/latest/end_to_end_tutorials/usage_pattern.html#parse-the-documents-into-nodes)"
662 | ]
663 | },
664 | {
665 | "cell_type": "markdown",
666 | "metadata": {},
667 | "source": [
668 | "## Fetch all file paths of .md files"
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": 83,
674 | "metadata": {},
675 | "outputs": [
676 | {
677 | "name": "stdout",
678 | "output_type": "stream",
679 | "text": [
680 | "Looking for remote repository at https://github.com/Netflix/metaflow-docs\n",
681 | "Looking for remote repository at https://github.com/huggingface/accelerate\n"
682 | ]
683 | }
684 | ],
685 | "source": [
686 | "from rag_tools.filetypes.markdown import Mixin as mm\n",
687 | "# this cell is like a condensed version of `/flows/markdown_chunker.py`\n",
688 | "_mm = mm()\n",
689 | "_mm.repo_params = REPO_PARAMS\n",
690 | "\n",
691 | "# this is an unprocessed df, so you may want to clean it as /flows/data_table_processor.py does.\n",
692 | "df = _mm.load_df_from_repo_list()"
693 | ]
694 | },
695 | {
696 | "cell_type": "code",
697 | "execution_count": 84,
698 | "metadata": {},
699 | "outputs": [],
700 | "source": [
701 | "documents, ids = get_documents_from_content_section_df(df)\n",
702 | "df['doc_ids'] = ids"
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": 85,
708 | "metadata": {},
709 | "outputs": [],
710 | "source": [
711 | "index = VectorStoreIndex.from_documents(documents)"
712 | ]
713 | },
714 | {
715 | "cell_type": "code",
716 | "execution_count": 100,
717 | "metadata": {},
718 | "outputs": [
719 | {
720 | "data": {
721 | "text/markdown": [
722 | "The above instructions work even if you use [`@conda`\n",
723 | " decorators](/scaling/dependencies#managing-dependencies-with-conda-decorator) in your\n",
724 | " code; you need, however, to ensure that the `conda` binary is available in your `PATH`.\n",
725 | " The easiest way to do this is to set the `PATH` environment variable to properly include\n",
726 | " the path to the `conda` binary if it is in a non-standard location. In VSCode, you can\n",
727 | " simply add this value in the env section of launch.json and in PyCharm, the UI allows\n",
728 | " you to set environment variables."
729 | ],
730 | "text/plain": [
731 | ""
732 | ]
733 | },
734 | "metadata": {},
735 | "output_type": "display_data"
736 | }
737 | ],
738 | "source": [
739 | "import random\n",
740 | "dm(random.choice(documents).text)"
741 | ]
742 | },
743 | {
744 | "cell_type": "markdown",
745 | "metadata": {},
746 | "source": [
747 | "# 4. Use Cases"
748 | ]
749 | },
750 | {
751 | "cell_type": "markdown",
752 | "metadata": {},
753 | "source": [
754 | "## Load a dataframe of text chunks and metadata from your latest workflow runs"
755 | ]
756 | },
757 | {
758 | "cell_type": "code",
759 | "execution_count": 6,
760 | "metadata": {},
761 | "outputs": [
762 | {
763 | "name": "stdout",
764 | "output_type": "stream",
765 | "text": [
766 | "1692847583518101\n"
767 | ]
768 | }
769 | ],
770 | "source": [
771 | "# find latest Metaflow run that saved processed df\n",
772 | "run = None\n",
773 | "for _run in Flow('DataTableProcessor'):\n",
774 | " if _run.data.save_processed_df:\n",
775 | " run = _run\n",
776 | " break\n",
777 | "\n",
778 | "print(run.id)"
779 | ]
780 | },
781 | {
782 | "cell_type": "code",
783 | "execution_count": 7,
784 | "metadata": {},
785 | "outputs": [],
786 | "source": [
787 | "df = run.data.processed_df\n",
788 | "documents, ids = get_documents_from_content_section_df(df)\n",
789 | "df['doc_id'] = ids\n",
790 | "index = VectorStoreIndex(documents)"
791 | ]
792 | },
793 | {
794 | "cell_type": "markdown",
795 | "metadata": {},
796 | "source": [
797 | "## Q&A"
798 | ]
799 | },
800 | {
801 | "cell_type": "code",
802 | "execution_count": 12,
803 | "metadata": {},
804 | "outputs": [
805 | {
806 | "name": "stdout",
807 | "output_type": "stream",
808 | "text": [
809 | "['1', '1209']\n"
810 | ]
811 | },
812 | {
813 | "data": {
814 | "text/markdown": [
815 | "#### What is Metaflow?"
816 | ],
817 | "text/plain": [
818 | ""
819 | ]
820 | },
821 | "metadata": {},
822 | "output_type": "display_data"
823 | },
824 | {
825 | "data": {
826 | "text/markdown": [
827 | "**Retrieved Response**"
828 | ],
829 | "text/plain": [
830 | ""
831 | ]
832 | },
833 | "metadata": {},
834 | "output_type": "display_data"
835 | },
836 | {
837 | "data": {
838 | "text/markdown": [
839 | "Metaflow is a Python library that simplifies the development, deployment, and operation of data-intensive applications, particularly those related to data science and machine learning. It was initially created at Netflix to enhance the productivity of data scientists working on various projects. Metaflow is available as an open-source framework under the Apache License, Version 2.0. It allows data scientists to focus on important aspects like feature engineering and model development while abstracting away tasks such as job organization, orchestration, scheduling, and interaction with data warehouses. Additionally, Metaflow enables the building of production-ready machine learning workflows using a simple Python API and facilitates seamless transitions between local prototyping environments and cloud-based deployments."
840 | ],
841 | "text/plain": [
842 | ""
843 | ]
844 | },
845 | "metadata": {},
846 | "output_type": "display_data"
847 | },
848 | {
849 | "data": {
850 | "text/markdown": [
851 | "#### Sources"
852 | ],
853 | "text/plain": [
854 | ""
855 | ]
856 | },
857 | "metadata": {},
858 | "output_type": "display_data"
859 | },
860 | {
861 | "data": {
862 | "text/markdown": [
863 | "##### [What is Metaflow](https://docs.metaflow.org/introduction/what-is-metaflow#what-is-metaflow)"
864 | ],
865 | "text/plain": [
866 | ""
867 | ]
868 | },
869 | "metadata": {},
870 | "output_type": "display_data"
871 | },
872 | {
873 | "data": {
874 | "text/markdown": [
875 | "Metaflow is a human-friendly Python library that makes it straightforward to develop, deploy, and op..."
876 | ],
877 | "text/plain": [
878 | ""
879 | ]
880 | },
881 | "metadata": {},
882 | "output_type": "display_data"
883 | },
884 | {
885 | "data": {
886 | "text/markdown": [
887 | "##### [Reproducible ML pipelines with Metaflow](https://outerbounds.com/blog/machine-learning-pipelines-from-prototype-to-production#reproducible-ml-pipelines-with-metaflow)"
888 | ],
889 | "text/plain": [
890 | ""
891 | ]
892 | },
893 | "metadata": {},
894 | "output_type": "display_data"
895 | },
896 | {
897 | "data": {
898 | "text/markdown": [
899 | "To help data scientists focus on the parts of the stack they really care about, such as feature engi..."
900 | ],
901 | "text/plain": [
902 | ""
903 | ]
904 | },
905 | "metadata": {},
906 | "output_type": "display_data"
907 | }
908 | ],
909 | "source": [
910 | "question = \"What is Metaflow?\"\n",
911 | "generative_search_engine_iter(question, index, df)"
912 | ]
913 | },
914 | {
915 | "cell_type": "code",
916 | "execution_count": 108,
917 | "metadata": {},
918 | "outputs": [
919 | {
920 | "data": {
921 | "text/markdown": [
922 | "#### How does Metaflow work with Kubernetes?"
923 | ],
924 | "text/plain": [
925 | ""
926 | ]
927 | },
928 | "metadata": {},
929 | "output_type": "display_data"
930 | },
931 | {
932 | "data": {
933 | "text/markdown": [
934 | "**Retrieved Response**"
935 | ],
936 | "text/plain": [
937 | ""
938 | ]
939 | },
940 | "metadata": {},
941 | "output_type": "display_data"
942 | },
943 | {
944 | "data": {
945 | "text/markdown": [
946 | "Metaflow now has the capability to run on top of any Kubernetes cluster. Users can run all or parts of any Metaflow flow on Kubernetes from their workstation by using the command \"run --with kubernetes\". Additionally, users can deploy their flow to Argo Workflows, a Kubernetes-native workflow scheduler, with a single command \"argo-workflows create\" to execute the flow asynchronously. For more information on setting up and operating Kubernetes for Metaflow, users can refer to the engineering resources provided by Metaflow."
947 | ],
948 | "text/plain": [
949 | ""
950 | ]
951 | },
952 | "metadata": {},
953 | "output_type": "display_data"
954 | },
955 | {
956 | "data": {
957 | "text/markdown": [
958 | "#### Sources"
959 | ],
960 | "text/plain": [
961 | ""
962 | ]
963 | },
964 | "metadata": {},
965 | "output_type": "display_data"
966 | },
967 | {
968 | "data": {
969 | "text/markdown": [
970 | "##### [Add capability to launch Metaflow tasks on Kubernetes and schedule Metaflow flows with Argo Workflows.](https://docs.metaflow.org/internals/release-notes#add-capability-to-launch-metaflow-tasks-on-kubernetes-and-schedule-metaflow-flows-with-argo-workflows)"
971 | ],
972 | "text/plain": [
973 | ""
974 | ]
975 | },
976 | "metadata": {},
977 | "output_type": "display_data"
978 | },
979 | {
980 | "data": {
981 | "text/markdown": [
982 | "This release enables brand new capabilities for [Metaflow on top of\n",
983 | " Kubernetes](https://outerbounds..."
984 | ],
985 | "text/plain": [
986 | ""
987 | ]
988 | },
989 | "metadata": {},
990 | "output_type": "display_data"
991 | },
992 | {
993 | "data": {
994 | "text/markdown": [
995 | "##### [Using Kubernetes](https://docs.metaflow.org/scaling/remote-tasks/kubernetes#using-kubernetes)"
996 | ],
997 | "text/plain": [
998 | ""
999 | ]
1000 | },
1001 | "metadata": {},
1002 | "output_type": "display_data"
1003 | },
1004 | {
1005 | "data": {
1006 | "text/markdown": [
1007 | "Here are some useful tips and tricks related to running Metaflow on Kubernetes. See our\n",
1008 | " engineering..."
1009 | ],
1010 | "text/plain": [
1011 | ""
1012 | ]
1013 | },
1014 | "metadata": {},
1015 | "output_type": "display_data"
1016 | }
1017 | ],
1018 | "source": [
1019 | "question = \"How does Metaflow work with Kubernetes?\"\n",
1020 | "generative_search_engine_iter(question, index, df)"
1021 | ]
1022 | },
1023 | {
1024 | "cell_type": "code",
1025 | "execution_count": 109,
1026 | "metadata": {},
1027 | "outputs": [
1028 | {
1029 | "data": {
1030 | "text/markdown": [
1031 | "#### What is a DAG?"
1032 | ],
1033 | "text/plain": [
1034 | ""
1035 | ]
1036 | },
1037 | "metadata": {},
1038 | "output_type": "display_data"
1039 | },
1040 | {
1041 | "data": {
1042 | "text/markdown": [
1043 | "**Retrieved Response**"
1044 | ],
1045 | "text/plain": [
1046 | ""
1047 | ]
1048 | },
1049 | "metadata": {},
1050 | "output_type": "display_data"
1051 | },
1052 | {
1053 | "data": {
1054 | "text/markdown": [
1055 | "A DAG, or directed acyclic graph, is a graph that consists of nodes connected by directed edges, where the edges have a specific direction and there are no cycles in the graph. In the context of Metaflow, a DAG is inferred based on the transitions between step functions. The nodes in the DAG are the steps, which represent operations, and the edges represent the transitions between steps. The DAG structure is important for defining the flow of execution and dependencies between steps in Metaflow."
1056 | ],
1057 | "text/plain": [
1058 | ""
1059 | ]
1060 | },
1061 | "metadata": {},
1062 | "output_type": "display_data"
1063 | },
1064 | {
1065 | "data": {
1066 | "text/markdown": [
1067 | "#### Sources"
1068 | ],
1069 | "text/plain": [
1070 | ""
1071 | ]
1072 | },
1073 | "metadata": {},
1074 | "output_type": "display_data"
1075 | },
1076 | {
1077 | "data": {
1078 | "text/markdown": [
1079 | "##### [Graph](https://docs.metaflow.org/internals/technical-overview#graph)"
1080 | ],
1081 | "text/plain": [
1082 | ""
1083 | ]
1084 | },
1085 | "metadata": {},
1086 | "output_type": "display_data"
1087 | },
1088 | {
1089 | "data": {
1090 | "text/markdown": [
1091 | "Metaflow infers a directed (typically acyclic) graph based on the transitions between\n",
1092 | " step function..."
1093 | ],
1094 | "text/plain": [
1095 | ""
1096 | ]
1097 | },
1098 | "metadata": {},
1099 | "output_type": "display_data"
1100 | },
1101 | {
1102 | "data": {
1103 | "text/markdown": [
1104 | "##### [The Structure of Metaflow Code](https://docs.metaflow.org/metaflow/basics#the-structure-of-metaflow-code)"
1105 | ],
1106 | "text/plain": [
1107 | ""
1108 | ]
1109 | },
1110 | "metadata": {},
1111 | "output_type": "display_data"
1112 | },
1113 | {
1114 | "data": {
1115 | "text/markdown": [
1116 | "Metaflow follows [the dataflow\n",
1117 | " paradigm](https://en.wikipedia.org/wiki/Dataflow_programming) which ..."
1118 | ],
1119 | "text/plain": [
1120 | ""
1121 | ]
1122 | },
1123 | "metadata": {},
1124 | "output_type": "display_data"
1125 | }
1126 | ],
1127 | "source": [
1128 | "question = \"What is a DAG?\"\n",
1129 | "generative_search_engine_iter(question, index, df)"
1130 | ]
1131 | },
1132 | {
1133 | "cell_type": "markdown",
1134 | "metadata": {},
1135 | "source": [
1136 | "## RAG for a generic sales pitch "
1137 | ]
1138 | },
1139 | {
1140 | "cell_type": "markdown",
1141 | "metadata": {},
1142 | "source": [
1143 | "### A custom prompt template in pure Python\n",
1144 | "Many tools exist to make prompts easy to manage. [Langchain](https://www.langchain.com/) is an emerging leader in this space."
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "code",
1149 | "execution_count": 126,
1150 | "metadata": {},
1151 | "outputs": [],
1152 | "source": [
1153 | "prompt_template = \"\"\"\n",
1154 | "Write an introduction email to a potential technical user who works as a {role} at {company}.\n",
1155 | "\n",
1156 | "Do not directly address the user's role or company anywhere in the email.\n",
1157 | "\n",
1158 | "Write the email for a technical audience who doesn't want to read marketing copy.\n",
1159 | "\n",
1160 | "Highlight Metaflow features related to their interests including {interests}.\n",
1161 | "\n",
1162 | "Include a summary motivating the benefits of these features by summarizing this context about Metaflow:\n",
1163 | "{context_about_interests}\n",
1164 | "\n",
1165 | "Make a subtle reference that Outerbounds platform can help them with {enterprise_platform_interest_hook}.\n",
1166 | "\n",
1167 | "Include a summary motivating the benefits of Outerbounds platform by summarizing this context about Outerbounds platform:\n",
1168 | "{context_about_enterprise_platform_interest_hook}\n",
1169 | "\n",
1170 | "Make the CTA to schedule a meeting to discuss how Outerbounds platform can help them.\n",
1171 | "\n",
1172 | "Make the email as short as possible. \n",
1173 | "\n",
1174 | "Do not reference your own profession or any experiences. Do not talk about yourself.\n",
1175 | "\n",
1176 | "Do not explicitly reference the company that the receiver works for. Only implicitly use this knowledge to demonstrate knowledge of the problems their organization may face.\n",
1177 | "\n",
1178 | "Avoid speaking from the first person.\n",
1179 | "\n",
1180 | "Avoid directly saying that you know about anyone's past experience or background. \n",
1181 | "\n",
1182 | "Avoid saying anything with similar sentiment to these statements:\n",
1183 | "
- Author Metaflow flows using notebooks\n",
1184 | "
- Refer to an interest listed above as something Metaflow works with\n",
1185 | "\"\"\""
1186 | ]
1187 | },
1188 | {
1189 | "cell_type": "markdown",
1190 | "metadata": {},
1191 | "source": [
1192 | "### Prompt engineering 101\n",
1193 | "Inject some relevant context into our prompt template."
1194 | ]
1195 | },
1196 | {
1197 | "cell_type": "code",
1198 | "execution_count": 127,
1199 | "metadata": {},
1200 | "outputs": [],
1201 | "source": [
1202 | "things_we_learned_about_a_prospect = dict(\n",
1203 | " role=\"data scientist\",\n",
1204 | " company=\"Big Industries Co.\",\n",
1205 | "\n",
1206 | " # comma-separated lists\n",
1207 | " interests=\"mlops, deep learning, kubernetes\",\n",
1208 | " enterprise_platform_interest_hook=\"CI/CD, security\",\n",
1209 | ")"
1210 | ]
1211 | },
1212 | {
1213 | "cell_type": "markdown",
1214 | "metadata": {},
1215 | "source": [
1216 | "### RAGify! \n",
1217 | "Here we use the Q&A iteration you saw in previous sections, but instead of printing the results we are using them to \"augment\" the prompt."
1218 | ]
1219 | },
1220 | {
1221 | "cell_type": "code",
1222 | "execution_count": 128,
1223 | "metadata": {},
1224 | "outputs": [
1225 | {
1226 | "name": "stderr",
1227 | "output_type": "stream",
1228 | "text": [
1229 | "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:27<00:00, 9.04s/it]\n",
1230 | "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.34s/it]\n"
1231 | ]
1232 | }
1233 | ],
1234 | "source": [
1235 | "# RAG step - fetch some context\n",
1236 | "from tqdm import tqdm\n",
1237 | "\n",
1238 | "interest_list = things_we_learned_about_a_prospect[\"interests\"].split(\", \")\n",
1239 | "context_about_interests = []\n",
1240 | "for interest in tqdm(interest_list):\n",
1241 | " context_about_interest = qa_iter(\n",
1242 | "        f\"Describe how Metaflow and {interest} can be used together in ML workflows. Focus on Metaflow being used as a complementary tool.\",\n",
1243 | " index,\n",
1244 | " )\n",
1245 | " context_about_interests.append(context_about_interest)\n",
1246 | "\n",
1247 | "enterprise_platform_interest_hook = things_we_learned_about_a_prospect[\n",
1248 | " \"enterprise_platform_interest_hook\"\n",
1249 | "].split(\", \")\n",
1250 | "context_about_enterprise_platform_interest_hook = []\n",
1251 | "for interest in tqdm(enterprise_platform_interest_hook):\n",
1252 | " context_about_interest = qa_iter(\n",
1253 | "        f\"Describe how Metaflow and {interest} can be used together in ML workflows. Focus on Metaflow being used as a complementary tool.\",\n",
1254 | " index,\n",
1255 | " )\n",
1256 | " context_about_enterprise_platform_interest_hook.append(context_about_interest)"
1257 | ]
1258 | },
1259 | {
1260 | "cell_type": "code",
1261 | "execution_count": 129,
1262 | "metadata": {},
1263 | "outputs": [],
1264 | "source": [
1265 | "# unpack context injection data to encourage the generative LLM to embed the links in its Markdown response.\n",
1266 | "\n",
1267 | "def prepare_with_links(context_list: List[Context]):\n",
1268 | " def strip_numbers_and_punctuation(header):\n",
1269 | " import string\n",
1270 | " import re\n",
1271 | "\n",
1272 | " return re.sub(\n",
1273 | " r\"\\d\", \"\", header.translate(str.maketrans(\"\", \"\", string.punctuation))\n",
1274 | " )\n",
1275 | "\n",
1276 | " context_to_inject = \"\"\"\n",
1277 | "The results will be given to you in a format like\n",
1278 | "
- \"Text\": [Comma-separated list of [Link Label](Link URL)]\n",
1279 | "Use the [Link Label](Link URL) syntax in the summary, and use the links in context of the paragraph.\n",
1280 | "\n",
1281 | "These are the results to summarize:\n",
1282 | "\"\"\"\n",
1283 | " for _context in context_list:\n",
1284 | " similar_chunk_df = _context.get_link_df(df)\n",
1285 | " links = []\n",
1286 | " for header, url in list(\n",
1287 | " zip(similar_chunk_df.header.values, similar_chunk_df.page_url.values)\n",
1288 | " ):\n",
1289 | " links.append(f\"[{strip_numbers_and_punctuation(header)}]({url})\")\n",
1290 | " context_to_inject += (\n",
1291 | " \"
\" + \" -\" + ' \"' + _context.response + '\": ' + \", \".join(links)\n",
1292 | " )\n",
1293 | " return context_to_inject"
1294 | ]
1295 | },
1296 | {
1297 | "cell_type": "code",
1298 | "execution_count": 130,
1299 | "metadata": {},
1300 | "outputs": [],
1301 | "source": [
1302 | "prompt = prompt_template.format(\n",
1303 | " **things_we_learned_about_a_prospect,\n",
1304 | " context_about_interests=prepare_with_links(context_about_interests),\n",
1305 | " context_about_enterprise_platform_interest_hook=prepare_with_links(\n",
1306 | " context_about_enterprise_platform_interest_hook\n",
1307 | " )\n",
1308 | ")"
1309 | ]
1310 | },
1311 | {
1312 | "cell_type": "code",
1313 | "execution_count": 131,
1314 | "metadata": {},
1315 | "outputs": [
1316 | {
1317 | "name": "stdout",
1318 | "output_type": "stream",
1319 | "text": [
1320 | "Enter a negative sentiment to avoid by appending it to the prompt with an 'avoid' instruction, or press enter to continue, or type r then enter to restart: \n"
1321 | ]
1322 | }
1323 | ],
1324 | "source": [
1325 | "# append negative sentiment escapes to prompt.\n",
1326 | "# treat these like UX escape hatches, where you can always just stuff extra things in, until you reach the model's context width.\n",
1327 | "user_interaction = True\n",
1328 | "negative_sentiment_context_lines = []\n",
1329 | "while user_interaction:\n",
1330 | "\n",
1331 | " if len(negative_sentiment_context_lines) == 0:\n",
1332 | " user_input = input(\n",
1333 | "            \"Enter a negative sentiment to avoid by appending it to the prompt with an 'avoid' instruction, or press enter to continue, or type r then enter to restart: \"\n",
1334 | " )\n",
1335 | " else:\n",
1336 | " print(\"Current list of negative sentiment statements:\")\n",
1337 | " for line in negative_sentiment_context_lines:\n",
1338 | " print(line)\n",
1339 | " user_input = input(\n",
1340 | "            \"Enter another statement to add, type 'r' to reset the list, or press Enter to finish: \"\n",
1341 | " )\n",
1342 | " # process user input\n",
1343 | " if user_input == \"\":\n",
1344 | " user_interaction = False\n",
1345 | " elif user_input == \"r\":\n",
1346 | " negative_sentiment_context_lines = []\n",
1347 | " else:\n",
1348 | " negative_sentiment_context_lines.append(user_input)\n",
1349 | "\n",
1350 | "for line in negative_sentiment_context_lines:\n",
1351 | " prompt += \"
\" + \" -\" + \" \" + line"
1352 | ]
1353 | },
1354 | {
1355 | "cell_type": "markdown",
1356 | "metadata": {},
1357 | "source": [
1358 | "### View the final prompt the model will see"
1359 | ]
1360 | },
1361 | {
1362 | "cell_type": "code",
1363 | "execution_count": 132,
1364 | "metadata": {},
1365 | "outputs": [
1366 | {
1367 | "data": {
1368 | "text/markdown": [
1369 | "##### The first five hundred chars"
1370 | ],
1371 | "text/plain": [
1372 | ""
1373 | ]
1374 | },
1375 | "metadata": {},
1376 | "output_type": "display_data"
1377 | },
1378 | {
1379 | "data": {
1380 | "text/markdown": [
1381 | "\n",
1382 | "Write an introduction email to a potential technical user who works as a data scientist at Big Industries Co..\n",
1383 | "\n",
1384 | "Do not directly address the user's role or company anywhere in the email.\n",
1385 | "\n",
1386 | "Write the email for a technical audience who doesn't want to read marketing copy.\n",
1387 | "\n",
1388 | "Highlight Metaflow features related to their interests including mlops, deep learning, kubernetes.\n",
1389 | "\n",
1390 | "Include a summary motivating the benefits of these features by summarizing this context about Metaflow:\n",
1391 | "\n",
1392 | "The results will be give..."
1393 | ],
1394 | "text/plain": [
1395 | ""
1396 | ]
1397 | },
1398 | "metadata": {},
1399 | "output_type": "display_data"
1400 | }
1401 | ],
1402 | "source": [
1403 | "dm(\"##### The first five hundred chars\") \n",
1404 | "dm(prompt[:500] + '...')"
1405 | ]
1406 | },
1407 | {
1408 | "cell_type": "markdown",
1409 | "metadata": {},
1410 | "source": [
1411 | "### See your RAG app in action! "
1412 | ]
1413 | },
1414 | {
1415 | "cell_type": "code",
1416 | "execution_count": 133,
1417 | "metadata": {},
1418 | "outputs": [
1419 | {
1420 | "data": {
1421 | "text/markdown": [
1422 | "##### Generated email"
1423 | ],
1424 | "text/plain": [
1425 | ""
1426 | ]
1427 | },
1428 | "metadata": {},
1429 | "output_type": "display_data"
1430 | },
1431 | {
1432 | "data": {
1433 | "text/markdown": [
1434 | "Subject: Enhancing ML Workflows with Metaflow and Outerbounds Platform\n",
1435 | "\n",
1436 | "Dear [Recipient],\n",
1437 | "\n",
1438 | "I hope this email finds you well. I wanted to reach out to introduce you to Metaflow, a powerful tool that can enhance your data science workflows and help you achieve scalability, reproducibility, and production-readiness.\n",
1439 | "\n",
1440 | "Metaflow can be used as a complimentary tool in ML workflows, particularly when combined with MLOps practices. By integrating Metaflow into your pipeline, you can leverage its capabilities to build and deploy ML models more efficiently. It takes care of low-level infrastructure such as data, compute, orchestration, and versioning, allowing you to focus on the fun parts of building applications and models. [Metaflow takes care of the plumbing so you can focus on the fun parts](https://docs.metaflow.org/introduction/why-metaflow#10-metaflow-takes-care-of-the-plumbing-so-you-can-focus-on-the-fun-parts).\n",
1441 | "\n",
1442 | "For deep learning projects, Metaflow provides a robust and user-friendly foundation. It covers the full stack of DS/ML infrastructure, allowing you to focus on iterating on ideas quickly and deploying them confidently. [Metaflow covers the full stack of DS/ML infrastructure](https://docs.metaflow.org/introduction/why-metaflow#9-metaflow-covers-the-full-stack-of-ds-ml-infrastructure).\n",
1443 | "\n",
1444 | "If you're working with Kubernetes, Metaflow seamlessly integrates with it to leverage scalable infrastructure for running ML/DS applications. This makes it suitable for both small and large organizations. [Metaflow relies on systems that engineers know and trust](https://docs.metaflow.org/introduction/why-metaflow#11-metaflow-relies-on-systems-that-engineers-know-and-trust).\n",
1445 | "\n",
1446 | "In addition to Metaflow, I wanted to mention Outerbounds Platform, which can further enhance your ML workflows. It offers CI/CD capabilities, ensuring that changes to ML models and data pipelines are thoroughly tested and deployed in a reliable and efficient manner. Outerbounds Platform also prioritizes security, respecting your company's security policies and providing a secure environment for executing data science projects.\n",
1447 | "\n",
1448 | "I would love to schedule a meeting to discuss how Metaflow and Outerbounds Platform can specifically benefit your organization and address any challenges you may be facing. Please let me know a time that works for you, and I will be happy to set up a call.\n",
1449 | "\n",
1450 | "Looking forward to hearing from you.\n",
1451 | "\n",
1452 | "Best regards,\n",
1453 | "[Your Name]"
1454 | ],
1455 | "text/plain": [
1456 | ""
1457 | ]
1458 | },
1459 | "metadata": {},
1460 | "output_type": "display_data"
1461 | }
1462 | ],
1463 | "source": [
1464 | "query_engine = index.as_query_engine()\n",
1465 | "query_res = query_engine.query(prompt)\n",
1466 | "response = query_res.response\n",
1467 | "dm(\"##### Generated email\")\n",
1468 | "dm(response)"
1469 | ]
1470 | },
1471 | {
1472 | "cell_type": "markdown",
1473 | "metadata": {},
1474 | "source": [
1475 | "## Chatbot"
1476 | ]
1477 | },
1478 | {
1479 | "cell_type": "code",
1480 | "execution_count": 33,
1481 | "metadata": {},
1482 | "outputs": [],
1483 | "source": [
1484 | "repos = [\n",
1485 | " {\n",
1486 | " \"deployment_url\": \"docs.metaflow.org\",\n",
1487 | " \"repository_path\": os.path.expanduser(\"~/Dev/metaflow-docs\"),\n",
1488 | " \"repository_ref\": \"master\",\n",
1489 | " \"base_search_path\": \"docs\",\n",
1490 | " \"exclude_paths\": [\"docs/v\"],\n",
1491 | " \"exclude_files\": [\"README.md\", \"README\"],\n",
1492 | " }\n",
1493 | "]"
1494 | ]
1495 | },
1496 | {
1497 | "cell_type": "code",
1498 | "execution_count": 34,
1499 | "metadata": {},
1500 | "outputs": [],
1501 | "source": [
1502 | "def local_repos_to_docs(repos):\n",
1503 | " documents = []\n",
1504 | " for params in repos:\n",
1505 | " md_files = DocumentationExtractor().filter_files(\n",
1506 | " params[\"repository_path\"],\n",
1507 | " base_search_path=params[\"base_search_path\"],\n",
1508 | " exclude_paths=params[\"exclude_paths\"],\n",
1509 | " exclude_files=params[\"exclude_files\"],\n",
1510 | " considered_extensions=[\".md\"]\n",
1511 | " )\n",
1512 | " md_files = [abs_path for abs_path, _ in md_files]\n",
1513 | " documents += get_documents_from_md_file_paths(md_files)\n",
1514 | " return documents"
1515 | ]
1516 | },
1517 | {
1518 | "cell_type": "code",
1519 | "execution_count": 35,
1520 | "metadata": {},
1521 | "outputs": [
1522 | {
1523 | "data": {
1524 | "text/plain": [
1525 | "[Document(id_='818c7d59-7d59-4c9a-8e26-be8fa66929f6', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='168707b49cebf5424ed48615defbba683a0d7932f8451d263d491040c37aa87f', text='\\n\\nWelcome to Metaflow\\n\\nMetaflow makes it easy to build and manage real-life data science and machine learning projects.\\n\\n\\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'),\n",
1526 | " Document(id_='baebefe9-9cf4-4c6b-8831-3fec827af036', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='0c733db8a7e3a129b3c47977e53aa8d0ee5e0942e768f40d822a65e1da4f9aea', text='\\n\\nMotivation\\n\\n- Why Metaflow\\n- What is Metaflow\\n- Metaflow Resources\\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]"
1527 | ]
1528 | },
1529 | "execution_count": 35,
1530 | "metadata": {},
1531 | "output_type": "execute_result"
1532 | }
1533 | ],
1534 | "source": [
1535 | "documents = local_repos_to_docs(repos)\n",
1536 | "documents[:2]"
1537 | ]
1538 | },
1539 | {
1540 | "cell_type": "code",
1541 | "execution_count": 36,
1542 | "metadata": {},
1543 | "outputs": [],
1544 | "source": [
1545 | "temp = 0.0\n",
1546 | "model = \"gpt-3.5-turbo\"\n",
1547 | "chat_mode = \"react\""
1548 | ]
1549 | },
1550 | {
1551 | "cell_type": "code",
1552 | "execution_count": 37,
1553 | "metadata": {},
1554 | "outputs": [],
1555 | "source": [
1556 | "service_context = ServiceContext.from_defaults(llm=OpenAI(model=model, temperature=temp))\n",
1557 | "index = VectorStoreIndex.from_documents(documents, service_context=service_context)"
1558 | ]
1559 | },
1560 | {
1561 | "cell_type": "code",
1562 | "execution_count": 38,
1563 | "metadata": {},
1564 | "outputs": [],
1565 | "source": [
1566 | "chat_engine = index.as_chat_engine(service_context=service_context, chat_mode=chat_mode, verbose=True)"
1567 | ]
1568 | },
1569 | {
1570 | "cell_type": "code",
1571 | "execution_count": 48,
1572 | "metadata": {},
1573 | "outputs": [
1574 | {
1575 | "name": "stdout",
1576 | "output_type": "stream",
1577 | "text": [
1578 | "\u001b[38;5;200m\u001b[1;3mThought: I need to use a tool to help me answer the question.\n",
1579 | "Action: query_engine_tool\n",
1580 | "Action Input: {'input': 'How does Metaflow help AI developers?'}\n",
1581 | "\u001b[0m\u001b[36;1m\u001b[1;3mObservation: Metaflow helps AI developers by providing a robust and user-friendly foundation for data-intensive applications. It takes care of the low-level infrastructure such as data, compute, orchestration, and versioning, allowing developers to focus on building their own applications, models, and policies on top of it. This means that AI developers who have a basic understanding of Python can leverage Metaflow to streamline their development process and focus on the more enjoyable aspects of their work.\n",
1582 | "\u001b[0m"
1583 | ]
1584 | }
1585 | ],
1586 | "source": [
1587 | "response = chat_engine.chat(\"How does Metaflow help AI developers?\")"
1588 | ]
1589 | },
1590 | {
1591 | "cell_type": "code",
1592 | "execution_count": 34,
1593 | "metadata": {},
1594 | "outputs": [
1595 | {
1596 | "data": {
1597 | "text/markdown": [
1598 | "Metaflow helps AI developers by providing a robust and user-friendly foundation for data-intensive applications. It takes care of the low-level infrastructure such as data, compute, orchestration, and versioning, allowing developers to focus on building their own applications, models, and policies on top of it. This means that AI developers who have a basic understanding of Python can leverage Metaflow to streamline their development process and focus on the more enjoyable aspects of their work."
1599 | ],
1600 | "text/plain": [
1601 | ""
1602 | ]
1603 | },
1604 | "metadata": {},
1605 | "output_type": "display_data"
1606 | }
1607 | ],
1608 | "source": [
1609 | "dm(response.response)"
1610 | ]
1611 | },
1612 | {
1613 | "cell_type": "code",
1614 | "execution_count": 36,
1615 | "metadata": {},
1616 | "outputs": [
1617 | {
1618 | "name": "stdout",
1619 | "output_type": "stream",
1620 | "text": [
1621 | "\u001b[38;5;200m\u001b[1;3mThought: I have already answered this question. I can provide a summary of my previous response to help answer the question again.\n",
1622 | "Action: query_engine_tool\n",
1623 | "Action Input: {'input': \"The enjoyable aspects of AI developers' work\"}\n",
1624 | "\u001b[0m\u001b[36;1m\u001b[1;3mObservation: The enjoyable aspects of AI developers' work include the ability to create value with machine learning, similar to traditional software engineering. The development and deployment of ML systems are expected to mature and improve over time, just as software development has done in the past 20 years. Efforts from various entities, such as governments, open source communities, and for-profit companies, are being made to articulate risks, develop best practices, and provide tooling to support ML development. This indicates a positive prognosis for the future of AI development.\n",
1625 | "\u001b[0m\u001b[38;5;200m\u001b[1;3mResponse: The enjoyable aspects of AI developers' work include the ability to create value with machine learning, the opportunity for continuous learning and staying updated with the latest advancements, the potential for impactful applications, collaboration with interdisciplinary teams, the freedom to experiment and innovate, and the automation of repetitive tasks. Additionally, efforts from various entities are being made to support and improve the development and deployment of ML systems, indicating a positive prognosis for the future of AI development.\n",
1626 | "\u001b[0m"
1627 | ]
1628 | },
1629 | {
1630 | "data": {
1631 | "text/markdown": [
1632 | "The enjoyable aspects of AI developers' work include the ability to create value with machine learning, the opportunity for continuous learning and staying updated with the latest advancements, the potential for impactful applications, collaboration with interdisciplinary teams, the freedom to experiment and innovate, and the automation of repetitive tasks. Additionally, efforts from various entities are being made to support and improve the development and deployment of ML systems, indicating a positive prognosis for the future of AI development."
1633 | ],
1634 | "text/plain": [
1635 | ""
1636 | ]
1637 | },
1638 | "metadata": {},
1639 | "output_type": "display_data"
1640 | }
1641 | ],
1642 | "source": [
1643 | "response = chat_engine.chat(\"What are the enjoyable aspects of their work?\")\n",
1644 | "dm(response.response)"
1645 | ]
1646 | },
1647 | {
1648 | "cell_type": "code",
1649 | "execution_count": 69,
1650 | "metadata": {},
1651 | "outputs": [],
1652 | "source": [
1653 | "# chat_engine.chat_repl()"
1654 | ]
1655 | }
1656 | ],
1657 | "metadata": {
1658 | "kernelspec": {
1659 | "display_name": "RAG demo",
1660 | "language": "python",
1661 | "name": "rag-demo"
1662 | },
1663 | "language_info": {
1664 | "codemirror_mode": {
1665 | "name": "ipython",
1666 | "version": 3
1667 | },
1668 | "file_extension": ".py",
1669 | "mimetype": "text/x-python",
1670 | "name": "python",
1671 | "nbconvert_exporter": "python",
1672 | "pygments_lexer": "ipython3",
1673 | "version": "3.11.4"
1674 | }
1675 | },
1676 | "nbformat": 4,
1677 | "nbformat_minor": 4
1678 | }
1679 |
--------------------------------------------------------------------------------
/notebooks/markdown_repo_explorer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Extending these workflows to other repositories\n",
8 | "**IMPORTANT**: Please be mindful of the licenses of the repositories you choose to crawl with any tools, including those provided in this demonstration. This repository is intended to help you quickly get started indexing documentation in GitHub repositories, not for large-scale web scraping."
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## The main tool"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {
22 | "vscode": {
23 | "languageId": "plaintext"
24 | }
25 | },
26 | "outputs": [],
27 | "source": [
28 | "from rag_tools.repo.ops import DocumentationExtractor"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "The `DocumentationExtractor` has two functions, `filter_files` and `extract`."
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## Finding files to chunk\n",
43 | "First, a set of parameters defining how to crawl a GitHub repository is passed to a custom object defined in `rag_tools.repo.ops.DocumentationExtractor`. One option is to clone a repository and pass a local path to `filter_files`, which returns the paths of all files matching the `considered_extensions`."
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 7,
49 | "metadata": {
50 | "vscode": {
51 | "languageId": "plaintext"
52 | }
53 | },
54 | "outputs": [],
55 | "source": [
56 | "import os\n",
57 | "METAFLOW_DOCS_REPO_PATH = os.path.expanduser(\"~/Dev/metaflow-docs\")\n",
58 | "file_paths = DocumentationExtractor().filter_files(\n",
59 | " METAFLOW_DOCS_REPO_PATH,\n",
60 | " base_search_path = \"docs\",\n",
61 | " exclude_paths = [\"docs/v\"],\n",
62 | " exclude_files = [\"README.md\", \"README\"],\n",
63 | " considered_extensions = [\".md\"],\n",
64 | ")"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 9,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/plain": [
75 | "(58, ('/Users/eddie/Dev/metaflow-docs/docs/index.md', 'index.md'))"
76 | ]
77 | },
78 | "execution_count": 9,
79 | "metadata": {},
80 | "output_type": "execute_result"
81 | }
82 | ],
83 | "source": [
84 | "len(file_paths), file_paths[0]"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "Another option is to download the repository only temporarily by passing the remote repository's URL to `extract`, letting this function handle the `filter_files` call and any other parsing internally."
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 19,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "import pandas as pd\n",
101 | "from functools import partial\n",
102 | "from rag_tools.filetypes.markdown import parse_md_file_headers\n",
103 | "\n",
104 | "# we will pass this to the extract(parser=...) arg. \n",
105 | "# you can replace this with any function that takes in filepath and filename args.\n",
106 | "def headers_df_parser(filepath, filename, deployment_url, subdir) -> pd.DataFrame:\n",
107 | " return parse_md_file_headers(filepath, deployment_url=deployment_url, subdir=subdir)"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 20,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "params = {\n",
117 | " \"deployment_url\": \"docs.metaflow.org\",\n",
118 | " \"repository_path\": \"https://github.com/Netflix/metaflow-docs\",\n",
119 | " \"repository_ref\": \"master\",\n",
120 | " \"base_search_path\": \"docs\",\n",
121 | " \"exclude_paths\": [\"docs/v\"],\n",
122 | " \"exclude_files\": [\"README.md\", \"README\"],\n",
123 | "}"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 21,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "extractor = DocumentationExtractor(repo_url=params[\"repository_path\"])\n",
133 | "\n",
134 | "dfs = extractor.extract(\n",
135 | " base_path=params[\"base_search_path\"],\n",
136 | " ref=params[\"repository_ref\"],\n",
137 | " exclude_paths=params[\"exclude_paths\"],\n",
138 | " exclude_files=params[\"exclude_files\"],\n",
139 | " considered_extensions=[\".md\"],\n",
140 | " parser=partial(headers_df_parser, deployment_url=params['deployment_url'], subdir=params['base_search_path']),\n",
141 | ")"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 24,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "df = pd.concat(dfs)"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 26,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
243 | "text/plain": [
244 | " header \\\n",
245 | "10 **Store and load objects to/from a known S3 lo... \n",
246 | "0 Deploying Variants of Event-Triggered Flows \n",
247 | "99 [Fix `environment is not callable` error when ... \n",
248 | "71 State Machine execution history logging for AW... \n",
249 | "3 Single Flow, multiple developers \n",
250 | "\n",
251 | " contents type \\\n",
252 | "10 The above examples inferred the S3 location ba... H4 \n",
253 | "0 Consider this advanced scenario: You have depl... H1 \n",
254 | "99 Using `@environment` would often result in an ... H4 \n",
255 | "71 Metaflow now logs [State Machine execution his... H4 \n",
256 | "3 If `ProjectFlow` did not have a `@project deco... H3 \n",
257 | "\n",
258 | " page_url is_howto char_count \\\n",
259 | "10 https://docs.metaflow.org/scaling/data#store-a... False 1367 \n",
260 | "0 https://docs.metaflow.org/production/event-tri... False 983 \n",
261 | "99 https://docs.metaflow.org/internals/release-no... False 279 \n",
262 | "71 https://docs.metaflow.org/internals/release-no... False 587 \n",
263 | "3 https://docs.metaflow.org/production/coordinat... False 1773 \n",
264 | "\n",
265 | " word_count \n",
266 | "10 219 \n",
267 | "0 119 \n",
268 | "99 34 \n",
269 | "71 72 \n",
270 | "3 232 "
271 | ]
272 | },
273 | "execution_count": 26,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "df.sample(5)"
280 | ]
281 | }
282 | ],
283 | "metadata": {
284 | "kernelspec": {
285 | "display_name": "RAG demo",
286 | "language": "python",
287 | "name": "rag-demo"
288 | },
289 | "language_info": {
290 | "codemirror_mode": {
291 | "name": "ipython",
292 | "version": 3
293 | },
294 | "file_extension": ".py",
295 | "mimetype": "text/x-python",
296 | "name": "python",
297 | "nbconvert_exporter": "python",
298 | "pygments_lexer": "ipython3",
299 | "version": "3.11.4"
300 | }
301 | },
302 | "nbformat": 4,
303 | "nbformat_minor": 4
304 | }
305 |
--------------------------------------------------------------------------------
/rag_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/__init__.py
--------------------------------------------------------------------------------
/rag_tools/databases/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/databases/__init__.py
--------------------------------------------------------------------------------
/rag_tools/databases/vector_database.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | class AbstractDB:
5 |
6 | def __init__(self):
7 | pass
8 |
9 | def create_index(self, **kwargs):
10 | pass
11 |
12 | def upsert(self, **kwargs):
13 | pass
14 |
15 | def vector_search(self, **kwargs):
16 | pass
17 |
18 | def destroy_index(self, **kwargs):
19 | pass
20 |
21 |
22 | class PineconeDB(AbstractDB):
23 |
24 | def __init__(self,):
25 | super().__init__()
26 | import pinecone
27 | pinecone.init(
28 | api_key=os.environ['PINECONE_API_KEY'],
29 | environment=os.environ['GCP_ENVIRONMENT']
30 | )
31 |
32 | def create_index(self, index_name, dimension, metric='cosine'):
33 | import pinecone
34 |
35 | if index_name not in pinecone.list_indexes():
36 | pinecone.create_index(
37 | name=index_name,
38 | dimension=dimension,
39 |                 metric=metric
40 | )
41 | # wait a moment for the index to be fully initialized
42 | time.sleep(1)
43 |
44 | def upsert(self, index_name, embeddings, texts, ids):
45 | import pinecone
46 |
47 | # now connect to the index
48 | index = pinecone.GRPCIndex(index_name)
49 |
50 | # upsert the vectors, but this should be done in batches not one by one.
51 | print("Upserting vectors", end="")
52 | for idx, (txt, emb) in enumerate(zip(texts, embeddings)):
53 | upsert_response = index.upsert(
54 | vectors=[
55 |                     {'id': str(ids[idx]),
56 | 'values': emb.tolist(),
57 | 'metadata': {'text': txt},
58 | }
59 | ]
60 | )
61 | print(".", end="")
62 |
63 | def vector_search(self, index_name, vector, k=1):
64 | import pinecone
65 | index = pinecone.GRPCIndex(index_name)
66 | xc = index.query(vector.tolist(), top_k=k, include_metadata=True)
67 | return xc
68 |
69 | def destroy_index(self, index_name):
70 | import pinecone
71 | pinecone.delete_index(index_name)
72 |
73 |
74 | class LanceDB(AbstractDB):
75 |
76 | """
77 | LanceDB is a vector database that uses Lance to store and search vectors.
78 | """
79 |
80 | def __init__(self):
81 | super().__init__()
82 | self.mode = 'overwrite'
83 | self.dataset_path = "../../chatbot.lance"
84 | self.local_store = True
85 |
86 | def create_index(self, table_name, embeddings, texts, ids):
87 |
88 | import lance
89 | import pandas as pd
90 | import pyarrow as pa
91 | from lance.vector import vec_to_table
92 |
93 | data = pd.DataFrame({"text": texts, "id": ids})
94 | table = vec_to_table(embeddings)
95 | combined = pa.Table.from_pandas(data).append_column("vector", table["vector"])
96 |
97 | if self.local_store:
98 | ds = lance.write_dataset(combined, self.dataset_path, mode=self.mode)
99 |
100 | def upsert(self, table_name, embeddings, texts, ids):
101 | raise NotImplementedError("This LanceDB wrapper does not have upsert functionality beyond the create_index step yet.")
102 |
103 | def vector_search(self, table_name, vector, k=3):
104 | import lance
105 | ds = lance.dataset(self.dataset_path)
106 | return ds.to_table(
107 | nearest={
108 | "column": "vector",
109 | "k": k,
110 | "q": vector,
111 | "nprobes": 20,
112 | "refine_factor": 100
113 | }).to_pandas()
114 |
115 | def destroy_index(self, table_name):
116 | if self.local_store:
117 | import shutil
118 | shutil.rmtree(self.dataset_path)
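119 |
120 |
121 | # A minimal usage sketch (illustration only, not part of the original module). It
122 | # shows how the LanceDB wrapper above fits together: write a few vectors to the
123 | # local Lance dataset, query the nearest neighbor, then clean up. The random
124 | # 384-dimensional vectors stand in for real embeddings and, like the table name,
125 | # are assumptions made purely for this example.
126 | if __name__ == '__main__':
127 |     import numpy as np
128 |
129 |     texts = ["Metaflow makes it easy to build ML workflows.",
130 |              "LanceDB stores vectors in a local Lance dataset."]
131 |     ids = list(range(len(texts)))
132 |     embeddings = np.random.rand(len(texts), 384).astype(np.float32)
133 |
134 |     db = LanceDB()
135 |     db.create_index("chatbot", embeddings, texts, ids)  # writes to the dataset_path configured above
136 |     print(db.vector_search("chatbot", embeddings[0], k=1))  # nearest neighbor of the first chunk
137 |     db.destroy_index("chatbot")  # removes the local dataset directory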
--------------------------------------------------------------------------------
/rag_tools/embedders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/embedders/__init__.py
--------------------------------------------------------------------------------
/rag_tools/embedders/embedder.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | class AbstractEmbedder:
4 | def __init__(self, **kwargs):
5 | pass
6 |
7 | class OpenAIEmbedder(AbstractEmbedder):
8 |
9 | def __init__(self):
10 | super().__init__()
11 |
12 | def embed(self, sentences):
13 | pass
14 |
15 |
16 | class SentenceTransformerEmbedder(AbstractEmbedder):
17 | def __init__(self, model_path, device="cpu"):
18 | super().__init__()
19 | from sentence_transformers import SentenceTransformer
20 |
21 | self.model = SentenceTransformer(model_path)
22 | self.model.to(device)
23 |
24 | def embed(self, sentences):
25 | return self.model.encode(sentences, show_progress_bar=True)
26 |
27 |
28 | class FashionCLIPEmbedder(AbstractEmbedder):
29 | def __init__(self):
30 | super().__init__()
31 | from fashion_clip.fashion_clip import FashionCLIP
32 |
33 | self.fclip = FashionCLIP('fashion-clip')
34 |
35 | def embed(self, images):
36 | import numpy as np
37 | image_embeddings = self.fclip.encode_images(images, batch_size=32)
38 |
39 | image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
40 | return image_embeddings
41 |
42 |
43 |
44 |
45 |
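46 | # A minimal usage sketch (illustration only, not part of the original module). It
47 | # shows the embed() contract shared by these classes: a list of sentences in, a
48 | # 2-D numpy array of vectors out. The model name below is an assumption for the
49 | # example; any local or hub-hosted sentence-transformers model path works the same way.
50 | if __name__ == '__main__':
51 |     embedder = SentenceTransformerEmbedder("sentence-transformers/all-MiniLM-L6-v2")
52 |     vectors = embedder.embed(["What is Metaflow?", "How does Metaflow work with Kubernetes?"])
53 |     print(vectors.shape)  # (2, embedding_dimension)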
--------------------------------------------------------------------------------
/rag_tools/filetypes/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/outerbounds/rag-demo/fddf6e0059eb1a3a0ea5cda3b954290c3629a35c/rag_tools/filetypes/__init__.py
--------------------------------------------------------------------------------
/rag_tools/filetypes/markdown.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 | from functools import partial
4 | import os
5 | import pickle
6 | from datetime import datetime
7 | import frontmatter
8 | import re
9 | import pandas as pd
10 | from slugify import slugify
11 |
12 |
13 | HEADERS_REGEX = re.compile("#{1,5}")
14 | NUMBERED_HEADER_REGEX = re.compile("")
15 | END_ESCAPE = "< END >"
16 | QUESTION_HEADER = "## Question"
17 |
18 |
19 | def is_useful_line(line):
20 | """
21 | Utility function to decide if this is some autogenerated or unhelpful line from filetypes pages, which can be generated in a variety of ways.
22 | """
23 | # if line == "\n": # New lines are helpful in the sections to determine paragraph chunks. Might want to make this a generalized filter instead of hand coded heuristics.
24 | # return False
25 | if line == "---\n":
26 | return False
27 | if line == "\n---":
28 | return False
29 | if "