├── src
│   ├── .python-version
│   ├── sync.py
│   ├── query_agent.py
│   ├── mcp_server.py
│   ├── pyproject.toml
│   ├── scripts
│   │   └── run-startup.py
│   └── startup.py
├── data
│   ├── readme.md
│   ├── podcast-descriptions
│   │   └── 132.txt
│   └── podcast-transcripts
│       ├── 129.txt
│       ├── 130.txt
│       └── 131.txt
└── readme.md

/src/.python-version:
--------------------------------------------------------------------------------
1 | 3.11
--------------------------------------------------------------------------------
/src/sync.py:
--------------------------------------------------------------------------------
1 | def sync_new_pods():
2 |     pass
3 | 
4 | def _sync_new_pods():
5 |     pass
--------------------------------------------------------------------------------
/data/readme.md:
--------------------------------------------------------------------------------
1 | # Weaviate Podcast Data
2 | 
3 | - Podcast Descriptions (this is what is currently indexed)
4 | - Podcast Summaries
5 | - Podcast Transcripts
--------------------------------------------------------------------------------
/src/query_agent.py:
--------------------------------------------------------------------------------
1 | def send_think_mode_request():
2 |     pass
3 | 
4 | def send_ask_mode_request():
5 |     pass
6 | 
7 | def send_search_mode_request():
8 |     pass
--------------------------------------------------------------------------------
/src/mcp_server.py:
--------------------------------------------------------------------------------
1 | from fastmcp import Context, FastMCP
2 | 
3 | class WeaviatePodcastMCP(FastMCP):
4 |     def __init__(self):
5 |         super().__init__(name="Weaviate Podcast MCP")
6 | 
7 |     def setup_tools(self):
8 |         """
9 |         Register the tools in the MCP server.
10 |         """
--------------------------------------------------------------------------------
/src/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "weaviate-podcast-search"
3 | version = "0.1.0"
4 | description = "An MCP server for the Weaviate Podcast"
5 | readme = "README.md"
6 | requires-python = ">=3.11"
7 | dependencies = [
8 |     "fastmcp>=2.0.0",
9 |     "weaviate-agents>=1.1.0",
10 |     "weaviate-client>=4.19.0",
11 | ]
--------------------------------------------------------------------------------
/src/scripts/run-startup.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import weaviate
4 | 
5 | from startup import startup
6 | 
7 | weaviate_client = weaviate.connect_to_weaviate_cloud(
8 |     cluster_url=os.getenv("WEAVIATE_URL"),
9 |     auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WEAVIATE_API_KEY")),
10 | )
11 | try:
12 |     startup(weaviate_client)
13 |     print("Startup completed successfully!")
14 | finally:
15 |     weaviate_client.close()
--------------------------------------------------------------------------------
/data/podcast-descriptions/132.txt:
--------------------------------------------------------------------------------
1 | Hey everyone! Thanks so much for watching the 132nd episode of the Weaviate Podcast with Thomas van Dongen, head of AI engineering at Springer Nature! Thomas is also the creator of Pyversity, a fast, lightweight library for diversifying retrieval results! Retrieval systems often return highly similar items. Pyversity efficiently re-ranks these results to encourage diversity, surfacing items that remain relevant but less redundant. It implements several popular diversification strategies such as MMR, MSD, DPP, and Cover with a clear, unified API.
--------------------------------------------------------------------------------
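The modules above are still stubs: `mcp_server.py` never registers any tools and `query_agent.py` only names its request modes. As a rough, minimal sketch (not part of this repository) of how `setup_tools` might expose the `WeaviatePodcast` collection created in `/src/startup.py` below, the snippet registers a single hybrid-search tool with FastMCP. The tool name `search_podcasts`, the `limit` parameter, and the reuse of the `WEAVIATE_URL`/`WEAVIATE_API_KEY` environment variables are illustrative assumptions.

```python
# Hypothetical sketch only: one way the WeaviatePodcastMCP stub could register a
# search tool. Names and parameters below are assumptions, not repository code.
import os

import weaviate
from fastmcp import FastMCP


class WeaviatePodcastMCP(FastMCP):
    def __init__(self):
        super().__init__(name="Weaviate Podcast MCP")
        # Connect once at startup; the collection itself is created by startup.py.
        self._client = weaviate.connect_to_weaviate_cloud(
            cluster_url=os.getenv("WEAVIATE_URL"),
            auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WEAVIATE_API_KEY")),
        )
        self.setup_tools()

    def setup_tools(self):
        """Register the tools in the MCP server."""
        podcasts = self._client.collections.get("WeaviatePodcast")

        @self.tool()
        def search_podcasts(query: str, limit: int = 5) -> list[dict]:
            """Hybrid search over the indexed Weaviate Podcast episodes."""
            response = podcasts.query.hybrid(query=query, limit=limit)
            return [obj.properties for obj in response.objects]


if __name__ == "__main__":
    # Serve over stdio so an MCP client (e.g. Claude Desktop) can connect.
    WeaviatePodcastMCP().run()
```

Registering tools inside `setup_tools` keeps the Weaviate connection in one place; the think/ask/search stubs in `/src/query_agent.py` could be wired up as additional tools in the same way.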
/src/startup.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import weaviate
4 | from weaviate.classes.config import Property, DataType
5 | from weaviate.collections import Collection
6 | 
7 | def startup(weaviate_client: weaviate.WeaviateClient):
8 |     # Create the collection on first run; later runs are a no-op.
9 |     if not weaviate_client.collections.exists("WeaviatePodcast"):
10 |         _create_pod_collections(weaviate_client)
11 | 
12 | def _create_pod_collections(weaviate_client: weaviate.WeaviateClient):
13 |     return weaviate_client.collections.create(
14 |         "WeaviatePodcast",
15 |         properties=[
16 |             Property(name="pod_number", data_type=DataType.INT),
17 |             Property(name="pod_summary", data_type=DataType.TEXT),
18 |         ],
19 |     )
20 | 
21 | def _upload_pods(collection: Collection):
22 |     # Read each summary file and batch-import it into the collection.
23 |     pods = []
24 |     for pod_filename in os.listdir("data/podcast-summaries"):
25 |         with open(f"data/podcast-summaries/{pod_filename}", "r") as f:
26 |             pod_summary = f.read()
27 |         pods.append({
28 |             "pod_number": int(pod_filename.split(".")[0]),
29 |             "pod_summary": pod_summary
30 |         })
31 |     with collection.batch.fixed_size(batch_size=100) as batch:
32 |         for pod in pods:
33 |             batch.add_object(properties=pod)
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # weaviate-podcast-MCP
2 | 
3 | An MCP server for the Weaviate Podcast!
4 | 
5 | ### Episode List
6 | 
7 | [132] Pyversity with Thomas van Dongen
8 | - [YouTube](https://www.youtube.com/watch?v=L2N1qvfP7tg)
9 | - [Spotify](https://spotifycreators-web.app.link/e/FBaEZ9b5UYb)
10 | 
11 | [131] Semantic Query Engines with Matthew Russo
12 | - [YouTube](https://www.youtube.com/watch?v=koPBr9W4qU0)
13 | - [Spotify](https://spotifycreators-web.app.link/e/WlPOhLtSUYb)
14 | 
15 | [130] REFRAG with Xiaoqiang Lin
16 | - [YouTube](https://www.youtube.com/watch?v=yi7v-UXMg0U)
17 | - [Spotify](https://spotifycreators-web.app.link/e/TMG3ILtSUYb)
18 | 
19 | [129] Weaviate and SAS with Saurabh Mishra and Bob van Luijt
20 | - [YouTube](https://www.youtube.com/watch?v=INKV21AaYjE)
21 | - [Spotify](https://spotifycreators-web.app.link/e/bBjTtMtSUYb)
22 | 
23 | [128] Weaviate's Query Agent with Charles Pierse
24 | - [YouTube](https://www.youtube.com/watch?v=TRTHw6vdVso)
25 | - [Spotify](https://spotifycreators-web.app.link/e/bt9QUMtSUYb)
26 | 
27 | [127] GEPA with Lakshya A.
Agrawal 28 | - [YouTube](https://www.youtube.com/watch?v=fREQrxhBSk0) 29 | - [Spotify](https://spotifycreators-web.app.link/e/xBLNlNtSUYb) 30 | 31 | [126] Agentic Topic Modeling with Maarten Grootendorst 32 | - [YouTube](https://www.youtube.com/watch?v=Lt6CRZ7ypPA) 33 | - [Spotify](https://spotifycreators-web.app.link/e/G9vd7NtSUYb) 34 | 35 | [125] Sufficient Context with Hailey Joren 36 | - [YouTube](https://www.youtube.com/watch?v=EU8BUMJLd54) 37 | - [Spotify](https://spotifycreators-web.app.link/e/FA7aAOtSUYb) 38 | 39 | [124] RAG Benchmarks with Nandan Thakur 40 | - [YouTube](https://www.youtube.com/watch?v=x9zZ03XtAuY) 41 | - [Spotify](https://spotifycreators-web.app.link/e/uEuSjPtSUYb) 42 | 43 | [123] MUVERA with Rajesh Jayaram and Roberto Esposito 44 | - [YouTube](https://www.youtube.com/watch?v=nSW5g1H4zoU) 45 | - [Spotify](https://spotifycreators-web.app.link/e/nAni3PtSUYb) -------------------------------------------------------------------------------- /data/podcast-transcripts/129.txt: -------------------------------------------------------------------------------- 1 | 00:00:00,200 --> 00:00:17,360 [Speaker 0] 2 | Hey, everyone. Thank you so much for watching another episode of the Weaviate podcast. Today, we're diving into our partnership between Weaviate and SAS. SAS is doing all sorts of amazing things with the SAS Retrieval Agent Management- Manager in their partnership with Weaviate. I'm super excited to discuss these things further. And firstly, I'm super excited to welcome Weaviate co-founder, Bob van Luit. Bob, thank you- 3 | 4 | 00:00:17,360 --> 00:00:17,370 [Speaker 1] 5 | Woo 6 | 7 | 00:00:17,370 --> 00:00:19,040 [Speaker 0] 8 | ... so much for joining the podcast again. 9 | 10 | 00:00:19,040 --> 00:00:22,390 [Speaker 1] 11 | Thanks for having me again on our, our own podcast [laughs]. 12 | 13 | 00:00:22,390 --> 00:00:29,440 [Speaker 0] 14 | [laughs] Awesome. I'm also super excited to welcome Saurabh Mishra from SAS. Saurabh, thanks so much for joining the podcast. 15 | 16 | 00:00:29,440 --> 00:00:33,960 [Speaker 1] 17 | Hey, everyone. Uh, nice to have, uh, nice to be here. Thanks, Connor and Bob. 18 | 19 | 00:00:33,960 --> 00:00:37,020 [Speaker 0] 20 | Great. 21 | 22 | 00:00:37,020 --> 00:00:37,440 [Speaker 1] 23 | Uh- 24 | 25 | 00:00:37,440 --> 00:00:39,400 [Speaker 0] 26 | Awesome. Um, Bob, maybe you kick it off, um, I- 27 | 28 | 00:00:39,400 --> 00:00:40,740 [Speaker 1] 29 | Yeah, yeah, yeah. 30 | 31 | 00:00:40,740 --> 00:00:44,949 [Speaker 1] 32 | I will... I-... Because I'm- I'm always... Connor, you always have these great exciting, uh- 33 | 34 | 00:00:44,949 --> 00:00:44,949 [Speaker 0] 35 | [laughs] 36 | 37 | 00:00:44,949 --> 00:00:49,990 [Speaker 1] 38 | ... you know, you kick these things off, so I'm always like... I'm- I'm almost like cur... Is that coming more or is this... You know. 39 | 40 | 00:00:49,990 --> 00:00:50,490 [Speaker 0] 41 | [laughs] 42 | 43 | 00:00:50,490 --> 00:01:08,020 [Speaker 1] 44 | No, but it's Saurabh, it's- it's great to have you here. And it's like a... Just before we started, you said something super interesting because... And I think that it would be great to start there, is you talked about, um, uh, uh, h- what you're seeing around RAG and how those use cases are growing. 45 | 46 | 00:01:08,020 --> 00:01:08,030 [Speaker 2] 47 | Mm-hmm. 48 | 49 | 00:01:08,030 --> 00:01:47,580 [Speaker 1] 50 | And how we think about this at Weaviate is that we see like, basically we're just like, first, we had like retrieval. 
Then from retrieval, we got to RAG. And then from RAG, out of that, we got to the agents. Yeah. So I would love to get, like, your take or maybe even SAS' take on- on, like, how you see the development, how you see that happening with- with customers and with- with- with- with users. And then we can nicely go into, of course, what you've launched, but I would love to get your take also, l- maybe a little bit from a historic perspective, uh, on that. Yes. Yeah, we can definitely g- be a little bit historic here, which in today's time just goes back to last year, um, because that's [laughs]- 51 | 52 | 00:01:47,580 --> 00:01:49,660 [Speaker 0] 53 | Exactly, yes. 54 | 55 | 00:01:49,720 --> 00:06:38,352 [Speaker 1] 56 | That's when we started thinking about it. So we started thinking about RAG really, um, first part of last year, where we started to kind of, uh, see that, you know, obviously language models have been a craze at that time. Everybody was talking about, you know, the weekly average number of users that ChatGPT has and it's growing. But there was this- there was this challenge of, you know, kind of adapting it to an enterprise scenario. And we started to kind of dig deeper into that, and, you know, you quickly find out that the majority of enterprise data is unstructured. And for- for you to kind of unlock enterprise use cases, you gotta be able to take general-purpose tooling, like the language models, and kind of tailor them to a scenario where they can be applied to that enterprise data. And RAG just seemed like the right pattern, so that's kind of... You know, when we f- first got serious about it, I mean, the- the aspect that we really liked is the fact that it is not tightly coupled to the model. So your- your model and data can change, which is- which is really good, which is very practical, which is what happens. Uh, so- so we started look- look at that. And then our first foray into this- this- this space was looking at the embeddings. How do you generate embeddings? So everybody was talking about RAG, but, you know, like, even before you get to the R of RAG, you need to do some preparation, right? I mean, your documents need to be prepared. And it just seemed to us at the time, um, that, you know, not a lot of people were talking about the preparation, and that could be a time-consuming, messy task. So- so we were like, "Well, how about we tackle this?" So- so we already have some capabilities within our portfolio that- that we use to operationalize complex AI pipelines, and we- we work with, uh, models that are obviously, you know, machine learning models that are trained with SAS, but we- we also use, uh, models that are trained elsewhere, right? I mean, we wanna give flexibility to data scientists. And this- this engine that we use is- is really versatile, really performing. So the first thing that we started to do was that let's- let's test if the- if the embedding pipeline, the pipeline to create, uh, embedding from documents can be manifested in this- in this engine. So we did that, and that- that seemed to work really well. And, you know, we were talking about customer scenarios emerging. So around the late summer of last year, we had a- a major manufacturer- manufacturing customer of ours, you know, approach us. And this is a, you know, large US-based manufacturer, kind of forward-leaning, and they already have, uh, you know... Predictive maintenance is a huge use case in manufacturing. 
And these guys run, you know, massive assets, and, you know, keeping up those assets and making sure they're not going down when not expected is a hu- a huge deal for them. So they already do, uh, you know, the typical things. Like, they already use machine learning on fast-moving sensor data, trying to anticipate anomalies and- and issues that might be happening. But what they were getting challenged is that this part is working fine. There's value getting created, but w- the output of this first part is- is a lot of alerts, right? And you have these alerts coming, and you're seeing that, you know, fault code coming up, this part. And then the next question is like, "Well, how do I address it?" And the answer to that question almost always is, "Go to documents." You know, you're talking about documents which are intense, dense. Manuals and maintenance logs, and which could be, like, hundreds of them, you know, fragmented across the company just depending on the scale of- of this. So then we started to kind of really dive into that. That was kind of our inspiration, that what if we converged these two workflows? Like, you know, the first part, which was like taking telemetry data, applying machine learning, and understanding what anomalies might be happening. And then once I have an anomaly, using that to kind of form a very sophisticated prompt that can then query across my knowledge base and come back with, uh, "Here is an actionable plan for you." Right? So this manufacturer's vision was to kind of eliminate the concept of fault codes and things like that, but just present a plan to their technician.So, that kind of pushed us forward. And we were like, well, this is an important, important capability. And we were like, this whole part of preparing data needs to be simplified. So then we started the journey of kind of developing a no-code user interface on top of it, which is what, you know... and we continued to iterate with it. And we started to look at options where it can be deployed in the cloud, can be deployed on-prem. We started to... A lot of our customers are in healthcare and government space. And, and some of them are 57 | 58 | 00:06:38,352 --> 00:07:05,751 [Speaker 1] 59 | averse to, you know, just kind of putting data wherever. Uh, so they wanted to kind of maintain control of it. So we started to look at, you know, what is the right architecture? And, you know, vector databases are just, just fundamental to this, right? I mean, you guys obviously know this, um, but I just want to make sure that everybody else [laughs] does. Uh, and we're like, well, you know, who are the leading vector databases? And that's how we kind of got connected and, and started. So, that's the, that's the historical perspective. 60 | 61 | 00:07:05,751 --> 00:07:37,402 [Speaker 3] 62 | Yeah, that... And, and just out of curiosity, so when... So, when was for you, like really you personally... Because, I mean, the whole deep learning wave, it's, is now happening for, what is it, like 12, 13 years, or 14 years maybe, that it just all started again? When was the... When was the time that you were like, "Okay, now, now we're serious. This is like... Now this is gonna change everything"? How, how long ago was that for you? Just for you personally? 63 | 64 | 00:07:37,402 --> 00:08:06,482 [Speaker 1] 65 | I would say personally, that was, uh, beginning of last year, right? Because I was, I was tracking the, the generative AI space. 
And I was, like I said, I was tracking like, the likes of ChatGPT and, and Claude getting, getting prominence. And everybody talking about, you know, millions of users on it. And, you know, being in a company like SAS, I always have, uh, more of an enterprise B2B lens. And I always try to kind of map like, you know, this is primarily a B2C led revolution, right? 66 | 67 | 00:08:06,482 --> 00:08:06,942 [Speaker 3] 68 | Mm-hmm. 69 | 70 | 00:08:06,942 --> 00:09:32,912 [Speaker 1] 71 | And what would it take to, you know, kind of make a connection back to the B2B enterprise side? And, and I thought RAG was this like really solid pattern that could, that could try to bridge this gap. So that, I would say the beginning of last year was number nine. Started to kind of really dive into this and had, had a little bit of a realization that, you know, look, for us to kind of take these powerful generative capabilities, but general-purpose capabilities, and make sure that they can unlock enterprise use cases, will definitely require an ability for them to speak to the enterprise data. And, you know, people were talking about, "Well, I can train a language model. I can fine-tune a language model." I mean, I, I, I don't know how many people are really seriously doing that. And, you know, then if... People still ask us about fine-tuned models. Uh, you know, we have a lot of customer conversation going on, and they're like, "Well, what about fine-tuned models?" I'm like, "If you're able to go ahead and fine-tune a model, it is not going to preclude RAG from being applied." You can still... I mean, that could be your model, right? Because your fine-tuned model still is capturing a snapshot in time, right? Unless you are fine-tuning it as fast as your data is changing, it is still going to get stale at some point in time. So RAG gives you a way to bring fresh data into it. So yeah, I would say, well, beginning of last year was kind of that realization for me. 72 | 73 | 00:09:32,912 --> 00:09:45,152 [Speaker 3] 74 | Yeah. And I... So I have a question around that. So I have with the... With... So, when I think about the enterprise, right? Just not specifically for AI, but in general. And, and I'm gonna 75 | 76 | 00:09:45,152 --> 00:09:53,612 [Speaker 3] 77 | overly generalize, but I, I, I, I'm just curious what your thoughts are on this. So the... I always think like in the enterprise, there, there are people 78 | 79 | 00:09:53,612 --> 00:09:59,012 [Speaker 3] 80 | whose job it is to figure out why you can't do something. 81 | 82 | 00:09:59,012 --> 00:09:59,791 [Speaker 1] 83 | [laughs] 84 | 85 | 00:09:59,791 --> 00:10:02,752 [Speaker 3] 86 | And then there's people whose job it is to 87 | 88 | 00:10:02,752 --> 00:10:03,781 [Speaker 3] 89 | do new stuff, right? 90 | 91 | 00:10:03,781 --> 00:10:03,791 [Speaker 1] 92 | Yeah. 93 | 94 | 00:10:03,791 --> 00:10:09,682 [Speaker 3] 95 | And then there's... So, and the first group is like, I, I don't know, might be risk manager something, right? 96 | 97 | 00:10:09,682 --> 00:10:09,752 [Speaker 1] 98 | Yeah. 99 | 100 | 00:10:09,752 --> 00:10:16,832 [Speaker 3] 101 | Or someone's like, "Are we... Did we think about, you know, uh, privacy?" Which, by all means, very important- 102 | 103 | 00:10:16,832 --> 00:10:16,932 [Speaker 1] 104 | Yes 105 | 106 | 00:10:16,932 --> 00:10:19,432 [Speaker 3] 107 | ... 
but, you know, but, but 108 | 109 | 00:10:19,432 --> 00:10:42,771 [Speaker 3] 110 | that group, I just want to, I want to, I want to park over here right now because that's like a... that I understand that and I appreciate that. It's very important that that needs to happen. But I'm actually interested in that, in that second group. What... How do you see or think that for that group that are building within the enterprise these AI applications or these RAG applications... 111 | 112 | 00:10:42,771 --> 00:10:57,281 [Speaker 3] 113 | How might the challenge be different for them to do things than, uh, for, let's say, a startup on the complete other end of the spectrum, building something AI-native? And again, I understand that, that, that first group, that... A startup doesn't have that first group, right? 114 | 115 | 00:10:57,281 --> 00:10:57,332 [Speaker 1] 116 | Yeah. 117 | 118 | 00:10:57,332 --> 00:11:00,892 [Speaker 3] 119 | So... But I... Everybody gets that, right? That's the thing. 120 | 121 | 00:11:00,892 --> 00:11:01,122 [Speaker 1] 122 | Right. 123 | 124 | 00:11:01,122 --> 00:11:03,021 [Speaker 3] 125 | I'm really thinking about it from a building perspective. 126 | 127 | 00:11:03,021 --> 00:11:03,051 [Speaker 1] 128 | Right. 129 | 130 | 00:11:03,051 --> 00:11:23,551 [Speaker 3] 131 | What are, what are challenges you're seeing in the enterprise, like why is it sometimes difficult for people... Or maybe it's not difficult at all. Maybe that's just not... Uh, maybe you, you, you think like that, that you haven't seen the difficulties. I'm just curious, why might it be difficult for those builders in the enterprise to get these AI applications off the ground? 132 | 133 | 00:11:23,551 --> 00:12:30,528 [Speaker 1] 134 | Yeah. No, first of all, you're absolutely right. There are those two camps and they exist in most large companies. So for, for the builders, I think, uh, I think the challenge was, uh, around... I think they would typically start with one of these frameworks, so you know, start with, you know, LangChain or likes. Um, they... A lot of them were able to tinker fast, right? I mean, they were able to kind of put things together and like, "Look, you know, here is something that I've created." But, you know, a lot of sponsorship in enterprise context comes from taking things to production and being able to kind of scale from there. And, and, and that was a challenge because these were...... somewhat flaky, uh, if, if you will. Why, you know, stuff that, that, that they put together and, you know, by the time they get to a point where, okay, I have this thing working, you find out that there's a new version of the framework. And the framework has changed under you, right? So it's a little bit of, I think, a mix of that, look, I need to be able to take this into production, uh, and some of these frameworks were 135 | 136 | 00:12:30,528 --> 00:13:30,898 [Speaker 1] 137 | great at building POCs, but not necessarily production level software. Um, you still require pretty serious engineering skills to them architect around it, to be, to be ready. I mean, that, that's al- also come back to do you have enough bandwidth and skillset? And I think the fact that, you know, there were so many choices, and frameworks changing all the time just kind of creates this, this, this challenge a- around like what exactly are we... What is our tech stack, uh, that we're building with, right? So ultimately that, that becomes the challenge. 
That, you know, it was, it was just not fast enough for them to, you know, take something from POC to production and then, then scale. I mean, this is not a new challenge. I mean, ed- enterprises face this challenge all the time, but this was no different, right? So that's, that's kind of what I s- what I, at least what I observed. We also work with enterprises that are, that are, you know, more in the way of more futuristic and they have- 138 | 139 | 00:13:30,898 --> 00:13:30,898 [Speaker 3] 140 | Okay 141 | 142 | 00:13:30,898 --> 00:13:41,988 [Speaker 1] 143 | ... the right in-house capabilities who have the experience of doing it. They were successful but they can't [clears throat] probably count them on, you know, my fingers on one hand. 144 | 145 | 00:13:41,988 --> 00:13:42,258 [Speaker 3] 146 | Yeah, yeah. 147 | 148 | 00:13:42,258 --> 00:13:47,348 [Speaker 1] 149 | Um, they were just so far and few. The majority of them were just tripping over it. 150 | 151 | 00:13:47,408 --> 00:14:24,137 [Speaker 3] 152 | Yeah, yeah. And actually, your, y- you, you, that's super interesting. And, and there was one thing in particular that I would like to double click on. You said something around the, um, uh, y- you said like the things that have changed because of AI which also mentioned the things that have not changed. And so for example I, I, uh, earlier today I had a conversation with somebody and we were talking about like the database agents that we have within, uh, the query agents within Weaviate and we talked about RBAC. And like traditional role-based access control that we said like, "That is something that's even for an agent that is just like unchanged." Right? So people just get access to stuff or they don't, right? 153 | 154 | 00:14:24,137 --> 00:14:24,148 [Speaker 1] 155 | [laughs] 156 | 157 | 00:14:24,148 --> 00:14:45,308 [Speaker 3] 158 | That is just, that is unchanged. I'm just curious in everything, uh, that you see happening with these AI u- use cases, what are you s- actually surprised of that has not changed? Where you actually go like, "Well, actually I was thinking this might have changed but actually didn't." It's like, "It is very traditional still." 159 | 160 | 00:14:45,308 --> 00:15:06,128 [Speaker 1] 161 | Well, I mean, I think it's probably a boring answer but the challenge with data has not changed. Every, like, all the, all the data challenges are still there because I think people still underestimate the, the amount of work it requires to get their data ready. Um, and, and, and that has, that has not changed. Um, 162 | 163 | 00:15:06,128 --> 00:15:46,968 [Speaker 1] 164 | I think the other thing that has not changed is, uh, the, the security overlay that is such important thing and, um, I think that, that concern around it, and you talked about, you know, this first group and second group. I think there's people in the first group who are, you know, who are meant to be risk averse. I mean, that's their role and they, they want to make sure that they have the right security infrastructure in place. So I think that has not changed, and that is leading to some interesting conversations because a lot of, you know, common wisdom is that I need to talk to language model. And you know, you often think about, well, I need to get one from OpenAI, or I want to get one from Anthropic. That does require fundamentally for you to 165 | 166 | 00:15:47,028 --> 00:15:52,488 [Speaker 1] 167 | get your data to them, right? 
I mean, whatever is goes in the prompt is, is being shipped to them. Right? 168 | 169 | 00:15:52,488 --> 00:15:52,548 [Speaker 3] 170 | Mm-hmm. 171 | 172 | 00:15:52,548 --> 00:16:49,328 [Speaker 1] 173 | So there are regulated industries which are like, you know, is this, is this right? You know, so that, the security first mindset, you know, is this something that we can, we can... Security or and privacy as well, right? I mean, it's not just about security. So that has, has not changed, right? I mean, there is, there is that still that I think it's probably a good thing, it's a rigor. Um, you could see that the second camp could be unhappy sometimes because it might slow them down. But, but I think the fact that much of data management, just data cataloging, that continues to be a, a challenge. And people who have invested in this, you know, are probably able to move for- faster than people who haven't. Which is true for a lot of AI and general enterprise applications. And the, the rigor that they put around security and privacy I think is still there. 174 | 175 | 00:16:49,328 --> 00:17:10,657 [Speaker 3] 176 | Yeah. Oh, yeah, that's, that's, th- that is, that is super interesting. And as, I was thinking if we, if we go to the reverse of that, right? So I think one of those great things I, I was recently I, I had it in a, in a, in a spreadsheet. So I was using, um, um, I was using, uh, um, a Google, uh, s- a spreadsheet so I- 177 | 178 | 00:17:10,657 --> 00:17:10,748 [Speaker 1] 179 | Mm-hmm 180 | 181 | 00:17:10,748 --> 00:17:19,628 [Speaker 3] 182 | ... they now have this new function in the spreadsheet that's like equal AI. And, and one of the nice things was like I wanted to make a list of something- 183 | 184 | 00:17:19,628 --> 00:17:20,367 [Speaker 1] 185 | Yeah 186 | 187 | 00:17:20,367 --> 00:17:30,828 [Speaker 3] 188 | ... uh, but the, it, the, the, it was in that, in, in, in that, in that spreadsheet in, in, in free text format. So basically in the prompt I said like, "Pick one of those, I don't know, five options." 189 | 190 | 00:17:30,828 --> 00:17:31,497 [Speaker 1] 191 | Yeah. 192 | 193 | 00:17:31,497 --> 00:17:49,627 [Speaker 3] 194 | And then based on these three text fields, create that structured data for me. And I found that extremely, extremely powerful. The... but it's of course interesting though that I, um, eh, that I noticed that here and there it made a mistake. So it, it was like a... it, it should have actually been the other option- 195 | 196 | 00:17:49,708 --> 00:17:49,878 [Speaker 1] 197 | Yeah 198 | 199 | 00:17:49,878 --> 00:18:00,228 [Speaker 3] 200 | ... um, uh, but it wa- was just too ambiguous or not done enough context or not enough information. And because me as a human reading the text that was in that cell- 201 | 202 | 00:18:00,228 --> 00:18:00,368 [Speaker 1] 203 | Right 204 | 205 | 00:18:00,368 --> 00:18:03,728 [Speaker 3] 206 | ... I knew that the, that it, that it was wrong. 207 | 208 | 00:18:03,728 --> 00:18:03,928 [Speaker 1] 209 | Yeah. 210 | 211 | 00:18:03,928 --> 00:18:26,639 [Speaker 3] 212 | So the... how are we dealing with... 
or how are you dealing with, especially in the end with enterprise customers is that-It, it feels to me that the, um, uh, the value that we're getting from AI, especially when it comes to RAG, so when the data goes into the model and then something comes out, that the, the cost of that, and I don't mean cost in, like, dollar amount cost- 213 | 214 | 00:18:26,639 --> 00:18:26,919 [Speaker 1] 215 | Yeah 216 | 217 | 00:18:26,919 --> 00:18:35,919 [Speaker 3] 218 | ... but the cost as in, like, uh, um, uh, um, uh, y- you know, the thing that we have to give into is, like, accuracy. So, we just need to accept- 219 | 220 | 00:18:35,919 --> 00:18:36,020 [Speaker 1] 221 | Yeah 222 | 223 | 00:18:36,020 --> 00:18:59,340 [Speaker 3] 224 | ... the fact it just cannot be 100% accurate. That's just a... It... Yeah, we have to accept that. How, how do... How big of a challenge do you see that in, in... with your Enterprise customers? Do you think that's more like an, an adoption cycle, or is it actually something that really needs to be resolved before people really can, you know, continue to implement these kind of solutions, especially for mission-critical, uh, applications? 225 | 226 | 00:18:59,340 --> 00:19:24,460 [Speaker 1] 227 | Yeah. That's a really, really good question, right? Because I think that's, uh, that's at some level kind of almost the, the unlock that needs to happen at some level. Because there is... You know, anytime you have a language model in the mix, there is some level of non-determinism that exists in your workflow, and you cannot, you cannot completely get around it. So, I think 228 | 229 | 00:19:24,460 --> 00:20:25,240 [Speaker 1] 230 | there's a couple of ways that, that we are trying to address. One is that, you know, how do we make sure that, like, from our perspective, our capabilities, our products offer the right information so that the users can, can trust it, that it's reliable, right? So, so we, you know, obviously are, from our perspective are investing in, in AI evals, so that, you know, we can allow the users to generate their own evals and, and test with it. Um, then, you know, every response that we create, we show citations. And we actually in our... In our product, we are able to kind of take those citations, so we see that, "Okay, here are the chunks that we pulled," and then we actually allow them to click on that chunk, and it goes to that document. Like, and you actually scroll to that document, to that page, like, "This is the paragraph that you just saw there," right? So, so, so just at feature level, stuff like that is, is building some level of, like, trust. Like, you know, this is... Okay, here is how the response was generated. 231 | 232 | 00:20:25,240 --> 00:22:14,399 [Speaker 1] 233 | But I think there is still a fundamental level of, like, "What am I comfortable letting this do?" Right? So, this is, this is still a, a big question, right? So, if, if I'm supporting a chat interface with a natural language, and then I think, "Yes, I- I'm good." I can... I am receiving the information, and I'm deciding the next course of action. I mean, this is something, like, it's synthesizing, it's summarizing for me, and I can, I can decide that, right? So, so that I think is... There is good adoption, right? I mean, chat-based assistants are, are doing well. But then you start to think about agents, and one key characteristic of an agent, uh, has to be autonomy. And I think that's where, that's where the, the, the challenge lies. So, what... 
At least what we are seeing is that, you know, the, the... The, the... I think there's two aspects, right? One is human-in-the-loop agents. So, you know, especially for mission-critical tasks, the agent will get to a certain point and then require a human to almost just say that, "Yes, this makes sense. Go ahead and do that." So, so that's one way that they are addressing it. And other way is that your agentic workflow could combine, uh, a more traditional, either a rule-based approach or a deterministic model along with, you know, a RAG call or an LLM call. So that, you know that... Let's say, if you're... You know, you talk about, let's say, a ... application somebody applies. So, you, you're not passing that to the LLM to make that, that determination. So, you will have, you know, a bit... You know, more, um, traditional, if I can call it, machine learning model that decides that. But in terms of, let's say, doing anything else, like, you know, creating the, the offer letter or whatever it is, you can obviously use, use a l- l- large language models to do that. So, I think that's kind of what we are seeing- 234 | 235 | 00:22:14,399 --> 00:22:14,409 [Speaker 3] 236 | Mm-hmm 237 | 238 | 00:22:14,409 --> 00:22:43,440 [Speaker 1] 239 | ... that, you know, keeping human-in-the-loop for machine-critical tasks where even though it's maybe a, a stamp of approval, but the user provides that. Or your agentic flows, um, could include more of a... more traditional deterministic models so that you know that those are proven, hardened, and I have tested them. Um, but I think that's... That is... I mean, we're s- still on this journey. We, uh... So, I don't know exactly what patterns could unlock in future, but that's at least what, what we are seeing. 240 | 241 | 00:22:43,440 --> 00:22:57,199 [Speaker 3] 242 | Yeah. And the... So that, that's, that's s- that's super insightful. And I think, I'm, I'm also c-... I just want to go a little bit more into the platform, actually, right? So, because, uh, I think that people listening are gonna be very curious about that, too. So, 243 | 244 | 00:22:57,199 --> 00:23:33,379 [Speaker 3] 245 | can you share a little bit what, uh... You, you already said a little bit around it with the tooling and the frameworks and people going to production. Can you... Can you share a little bit more about the design choices that you've made. And of course, and, and as, as Connor said in the beginning, we're super, you know, happy and proud to partner with, uh, with all of you on this... On, on the, uh... on this project. But I, I, I, I would love to understand, like, the design choices you've made, what you think is important, um, uh, es- especially from a SaaS perspective for the, for the end customers. And, and, and of course, we also would always love to learn, like, what you, what you like and enjoyed in, in building with our, uh, technology. 246 | 247 | 00:23:34,600 --> 00:25:22,568 [Speaker 3] 248 | Yeah, sure. So, we, we, we did try and be deliberate about our design choices. So, there were... There was, like, some keywords that I would, I would throw and talk about. So, we wanted to ensure that flexibility is one of our key design principles. So from the flexibility... I mean, it, it, it manifests in multiple forms. So one flexibility aspect is, we will let you deploy this where your data is, right? So whether that's in, in, in your cloud, whether that's on-prem, so, so we wanted to make sure that we, we support that. Two, we... 
In our pipelines, uh, when we are, like I said, preparing data for RAG is a non-trivial task and we wanted to make sure that was-That was handled with flexibility in mind. So we allow you to- allow you a choice of embedding models. Um, so, you know, whether... You know, we- so we bundle a set of embedding models, but we also, um, will shortly allow you to bring your own embedding models. And, and from that perspective, we are completely, I mean, transparent. We're open to, like, bring your LLM, bring your vector database. Although we have first-class integrations, but, like, if you want to choose a different language model provider, or different vector database provider, we respect that choice, right? I mean, that is as... Because we are dealing with enterprises. And in many cases, they already have a perspective that I have a partnership with XYZ, so you got to be able to work. So we, like, yep, we will, we will align to that. So that flexibility is an important principle. We really wanted to make sure that trustworthiness is a core element of this thing. Whether that's having a human kind of designing this workflow, whether that's the ability to design your own evils, or have an LLM generate an evil but you review it, and then you see the scores. And we actually follow this 249 | 250 | 00:25:22,568 --> 00:26:07,428 [Speaker 1] 251 | kind of a concept of a champion-challenger tournament type of a thing. So you can design multiple pipelines that convert your text data into embeddings. And then, you know, using your evils, or an LLM-generated evils, we allow you to score them. And you see, like, you know, at least... So the... on, on, on, on its own, let's say a number might not make sense, but compared to each other, the numbers will tell you something. So there is some quantitative basis now for you to c-... Like, okay, this pipeline using this embedding model and that chunking strategy does seem to perform better. So I am able to have some a- assurance that, you know, it's a good, reliable, trustworthy pipeline. So I will choose that. I talked about the citation. So trustworthiness was, was really important for this. 252 | 253 | 00:26:07,428 --> 00:26:24,128 [Speaker 1] 254 | The third thing I would say is the time to value aspect. So, we saw people struggle with this thing. You know, like, it, it took ages for them to get this thing up and running, and they were still... looked like, you know, there's some tape and straw kind of holding this together. So we wanted to make sure that 255 | 256 | 00:26:24,128 --> 00:26:31,787 [Speaker 1] 257 | the user interface was, was no code. And that's a challenging task, because this thing is 258 | 259 | 00:26:31,788 --> 00:27:12,908 [Speaker 1] 260 | non-trivial. I mean, there are settings and, you know, configurations that, that are just part of the... inherently this pipeline. So we wanted to kind of make sure that we allow you to make choices, but also have intelligent defaulting so that, you know, you can, you can move fast where you need to. But having the ability to kind of go from, you know, documents to embeddings within minutes using our interface, there's a time to value aspect that we wanted to keep in mind. Um, the last one I would, I would say is performance, because some of these workloads could be compute-intensive. Um, so there is a... You know, there's obviously a latency aspect to it, there's also, like, you know, how much infrastructure do you, you utilize? 
So we, if, we, we have some architectural 261 | 262 | 00:27:12,908 --> 00:27:30,528 [Speaker 1] 263 | patterns that we utilize to gain acceleration even when you are running on an infrastructure that's not accelerated. Uh, so you may not have GPUs in the infrastructure, but we can still gain acceleration, uh, due to some design choices that we made. So I think those were, like, our top 264 | 265 | 00:27:30,528 --> 00:27:36,288 [Speaker 1] 266 | design principles, and I think we've tried to really closely align with them. 267 | 268 | 00:27:36,348 --> 00:27:48,148 [Speaker 3] 269 | Yeah. That, that, th- th- that... Uh, all of that makes it, makes a ton of sense. And that is a... That's a... By the way, that's actually another thing where we, where you could actually say, like, that things haven't changed, right? So that- 270 | 271 | 00:27:48,148 --> 00:27:48,157 [Speaker 1] 272 | Yes 273 | 274 | 00:27:48,157 --> 00:28:12,528 [Speaker 3] 275 | ... time to value those... Th- that is just unchanged. And I think one of the things that I always find so, so fascinating, for example, um, if we, if we, um, uh... What we see, for example, with, with Weaviate, is the... I'm fascinated by the developer experience of, like, the clients. Um, um, so remind me again, which, which language are you guys, say, using again for the integrations? 276 | 277 | 00:28:12,528 --> 00:28:16,967 [Speaker 1] 278 | So we use, um... So all of the backend that we have is mostly Python. 279 | 280 | 00:28:16,967 --> 00:28:17,187 [Speaker 3] 281 | Yeah. 282 | 283 | 00:28:17,187 --> 00:28:26,487 [Speaker 1] 284 | Our front end is, is React, but, you know, that's, that's especially in this product. We also use a lot of Go in our backend and, and just in broader, broader company. 285 | 286 | 00:28:26,487 --> 00:28:33,368 [Speaker 3] 287 | Yeah, but... So Python is a great example, because I think it's safe to say that Python is like the lingua franca of, of, of- 288 | 289 | 00:28:33,368 --> 00:28:33,587 [Speaker 1] 290 | Yes 291 | 292 | 00:28:33,588 --> 00:29:03,628 [Speaker 3] 293 | ... AI, uh, nowadays. And what I find so fascinating also, if we look at the, at the Weaviate client, is like it's... We call it the Weaviate client, but it's actually not a client anymore. It's like more like a... It's like a library almost. It doesn't feel like that you're interacting with, with database, in my mind. Maybe you have a different point of view on this. But it's like, where you had, like, back in the days, like with, with SQL, that you were, like, literally writing SQL in the- 294 | 295 | 00:29:03,628 --> 00:29:04,038 [Speaker 1] 296 | Yeah 297 | 298 | 00:29:04,038 --> 00:29:15,558 [Speaker 3] 299 | ... code. And then you had that new generation that, of course, came with, with NoSQL, um, where you were still connecting drivers and stuff to a database. 300 | 301 | 00:29:15,558 --> 00:29:15,568 [Speaker 1] 302 | Yeah. 303 | 304 | 00:29:15,568 --> 00:29:24,087 [Speaker 3] 305 | And that's now kind of changing, that it feels almost like it's just a library. Oh yeah, and by the way, that offloads stuff to models and to a database. But it's... I just- 306 | 307 | 00:29:24,088 --> 00:29:24,266 [Speaker 1] 308 | Yeah 309 | 310 | 00:29:24,268 --> 00:29:30,268 [Speaker 3] 311 | ... I find it fascinating how that... the way that we write, uh, software is, is changing. 312 | 313 | 00:29:30,268 --> 00:29:48,668 [Speaker 1] 314 | Yeah, I 100% agree, right? 
So, I mean, I think abstraction is, I think, the key, key theme here that has emerged in, I would say, the last few decades. So I started as a programmer, and I started programming in C. And I still have nightmares when I think about the memory management that, that I would- 315 | 316 | 00:29:48,668 --> 00:29:48,678 [Speaker 3] 317 | [laughs] 318 | 319 | 00:29:48,678 --> 00:31:15,760 [Speaker 1] 320 | ... almost always mess up. And then I find that, you know, there is some memory leak, and then you would need this... some expert to be able to kind of review my code and find, like, you know, this is where you missed it. Um, so I think we've made, uh, a lot of, uh, progress, um, from that perspective. So, yeah, I, I absolutely agree. I think the abstractions that have been created, you kind of have similar looking endpoints, similar looking libraries. I think they go... They have gone a long way in democratizing this. Um-I mean, I think there is still... I mean, I still have, you know, people on the team who are, like, extreme C++ experts. And when there is time to design this, like, really, really important mission critical performance stuff, you're going to do it in C++. But a lot of the software doesn't necessarily need that. And the democratization and abstraction has, has really helped. Um, and I- I want to go back to the previous question. You asked me about, uh, you know, how I think we started working together, going through the design principles. I never answered that part of it. So when I think about, you know, obviously flexibility was important to us, but we wanted to make sure that... And the flexibility also has its cost, right? If you make something too flexible, like, okay, I have to make choices for everything now, right? So we wanted to offer the flexibility, but we wanted... We were very clear that we'll have some first-class integrations that we will start with, right? So we, we started to look at, you know, options with the vector database options, and 321 | 322 | 00:31:15,760 --> 00:31:40,930 [Speaker 1] 323 | you guys were, were a- an extremely good fit for what we were trying to do because obviously we wanted to offer a first-class vector database experience, and- and you guys have that. And two, the- the ability to, you know, kind of spin up in the cloud, spin up on-prem. I mean, that, that just totally aligned to, you know, what we were trying to do. And then, I guess I have to give shout-out to- to your team, Bob. I mean, you guys have been amazing, right? So working with, uh- 324 | 325 | 00:31:40,930 --> 00:31:41,640 [Speaker 3] 326 | Thank you 327 | 328 | 00:31:41,640 --> 00:32:09,760 [Speaker 1] 329 | ... Joby and Erica. I mean, they- they're, they're like a dream team. They're connected with, uh, my engineering team. Um, and you know, they talk to each other. They, they presented at conferences together. So I think it, it, it has really, it has really evolved, you know, very organically in, in terms of... Like, sometimes we've had these partnerships which are still quite, uh, abrupt here and there. But this has been... You know, it's flowed really naturally. So I- I- I appreciate your team for that. 330 | 331 | 00:32:09,760 --> 00:32:20,040 [Speaker 3] 332 | Oh, that- that's very kind of you to say, and I think i- it's a... From everything I heard, it's actually... It's, it's, it's mutual. So it's like a, uh... It was just... 
It's a great partnership, and I think 333 | 334 | 00:32:20,040 --> 00:32:53,600 [Speaker 3] 335 | what's also so important is that us, as a, as a, as a, as a company in such a new space, we're also learning, right? So we need to have that feedback from people like, "What's working? What's not working? What do we need to improve?" And those kinda things. And that's a... That was super... Th- th- super valuable. So that... We- we appreciate that a lot, actually, in the, in the, uh, collaboration. So that's, uh, that's, that's very kind of you to say. So, so what's, uh, what's next? What's, what's on... What's the plan with the platform? What, what... Where are you guys going? What's the, what's the plan? 336 | 337 | 00:32:53,600 --> 00:33:37,000 [Speaker 1] 338 | So the plan is to... You know, we have a lot of... So we just went GA with this product that we call SaaS Retrieval Agent Manager. And because we like an acronym for everything, we call it RAM. So we- we just went GA a couple of weeks back, and we have, uh, a lot of interest. Uh, I'm starting to see a fair bit of interest. So we... Primarily in the enterprise space, so large government, um, entities, healthcare, life sciences, manufacturers, banks that have interest in this. So I think what's next is, how do we convert this interest into some meaningful engagements that drive value for them and revenue for us? Um, so that's, that's really what's next. We're really trying to kind of... Because 339 | 340 | 00:33:37,000 --> 00:35:39,120 [Speaker 1] 341 | I think there's obviously a lot of noise, uh, you could call it hype about it. But there are not a whole lot of patterns or, like... You know, I have this enterprise use case where we have scaled this, right? So I think at some level, everybody's still learning to, right? And that's, that's fine. I mean, that's how it's meant to be. Um, so we are, we are also learning, um, you know, finding that... You know, look, people... Sometimes, you know, I'm demoing, and I find out that the kind of question I get is not about this fancy embedding model and look how easy it is to create, but it's about what is the cost to deploy this? Like, okay [laughs], let me go back and talk about that first. So we are starting to kind of think about more, just more pragmatic... Like what does an implementation look like? What is the total cost of ownership? What are the day-two steps right after I have started? So really kind of going back to the... And that, again, that has not changed, right? Kind of going back to our initial set of questions, that is still a, a key concern of, like, what's the total cost of ownership and things like that. So really what's next for us, I think are, are, are three things. One is, like, really focusing on converting these, uh, you know, interest into pipelines that we can then engage on. Two, I think we're trying to enable, uh, both internally and externally. So we are, you know, working with a partner ecosystem that allows us to kind of go fast. And three, this space is moving fast. So, you know, it... Just because you just went GA does not mean that you can sit for three months and just see what happens, right? So we are also managing a very active roadmap in terms of, like, you know, what's next on the roadmap. So there is... There's some areas where, you know, we have a customer request, like you ought to support this. I'm like, "Okay, that can be added to the roadmap." 
Some are a little bit more esoteric and they require research, so we are kicking off research, um, streams for that. So I think that's, that's, that's kind of what next. Uh, there is still a fair bit of building left, but I think we have enough that can engage the market. 342 | 343 | 00:35:39,120 --> 00:35:49,580 [Speaker 3] 344 | Yeah, and it's... So this is also something... You, you, you, you touch upon something super interesting, and it's okay. I was actually discussing this with somebody actually today. Um, um, 345 | 346 | 00:35:49,580 --> 00:36:13,480 [Speaker 3] 347 | I believe that the term that's often used for it is like the curse of knowledge. It's like if you work in this, this space for so long with so many things, you just... All these assumptions you make about like, you know, did, did people think about this? Did people think about that? And do they understand that... And then when you talk to people who're just excited about it, that as you mentioned, that they just ask 348 | 349 | 00:36:13,540 --> 00:36:35,144 [Speaker 3] 350 | really basic questions, right? So it's like a... And that's a, uh... And, and I sometimes, I find it so hard, but I'm curious if you have thoughts on this because...I do think it's fair to say that it's, that it's still early in the sense of, like, building with AI. But sometimes I feel 351 | 352 | 00:36:35,144 --> 00:36:41,584 [Speaker 3] 353 | it's even way more early than we might think, right? So, so it's like a, um, 354 | 355 | 00:36:41,584 --> 00:36:54,564 [Speaker 3] 356 | in the early days of the, of the web or something, right? Where, um, uh, uh, that we sometimes think that with, with AI, we're actually already in the time that, uh, uh, Amazon.com is, uh, released to- 357 | 358 | 00:36:54,564 --> 00:36:54,574 [Speaker 1] 359 | Yeah 360 | 361 | 00:36:54,574 --> 00:37:03,364 [Speaker 3] 362 | ... to make that analogy. But that actually it's even still earlier. That we're still, like, in the pre-, the, the pre-Google days, to stay in that, uh, analogy. 363 | 364 | 00:37:03,364 --> 00:37:03,684 [Speaker 1] 365 | Yes. 366 | 367 | 00:37:03,684 --> 00:37:15,964 [Speaker 3] 368 | I'm just curious how, how are you ... What do you think about that? How ... In, if you try to compare that, for example, with the early internet or those kind of things, how, how, how ... Where do you think we are? 369 | 370 | 00:37:17,844 --> 00:37:24,784 [Speaker 1] 371 | Uh, so I think we are, we are early. Um, I would say that we are early in the ... 372 | 373 | 00:37:24,784 --> 00:37:35,554 [Speaker 1] 374 | Like, I, I'm, I have to really scratch my memory for the early internet days. But there was that early internet time when I remember, uh, somebody ... You know, you remember the bookstores that, that used to be there. 375 | 376 | 00:37:35,554 --> 00:37:35,584 [Speaker 3] 377 | Yeah. 378 | 379 | 00:37:35,584 --> 00:38:26,424 [Speaker 1] 380 | And they started to get replaced with the, with the online stuff. So I think at, at that time, nobody could project, right? I mean, it, it was still somewhat of an organic displacement that was starting to happen. And nobody could project that how disruptive this could become, right? I mean, there, there were industries that were wiped out, right? I mean, there were, like, bookstores that just closed forever. We might be just around that time, right? 
Because we don't know how disruptive this can be and what job functions would fundamentally change, what companies may no, no longer exist, what companies may actually prosper and thrive. So, I think we might be even at that early stage. And I think I'll likely go one extent beyond that. And what is challenging is that ... So when that was happening, 381 | 382 | 00:38:26,424 --> 00:39:15,884 [Speaker 1] 383 | that was the only thing that was happening in that space. But here what's the problem is that the, the consumer adoption has happened quite fast, right? Again, the B2C aspect. So you are ... You have these enterprise companies and you have these executives that are, that are under pressure right now. Like, "What's my AI strategy? What am I doing with agents? How many agents do I have deployed?" All right? So it has become a little bit of, like, you know, "I need to do something about it as a, as a technology enterprise leader," as opposed to, like, you know, again, just going back to the fundamentals, you know, where do I need it? What workflows can be accelerated? Where does it fit, where it is not? So sometimes I think that is kind of just accelerating this, this, th- this rush to get on the bandwagon, which is making this 384 | 385 | 00:39:15,884 --> 00:40:13,524 [Speaker 1] 386 | early even look even harder, right? The fact that they're missing that it is early and sometimes it will take time to, to get to the right patterns. But because that there is, again, 700 million weekly average users that ChatGPT has, like, you know, how do I get some of that goodness in my enterprise use cases? So I think we are pretty early from the perspective of how much disruption can happen. But it is quite challenging given the, the massive adoption that has happened in the B2C space. And it is also challenging because some of the employees in these companies are using that, right? I mean, productivity tools are just there, right? So if ... I mean, obviously most of us are using that. And for them to be not able to just connect the dot, like, you know, how can this unlock my, my business use cases, uh, is a pain point, right? And that is making this, this early even more challenging. 387 | 388 | 00:40:13,524 --> 00:40:35,324 [Speaker 3] 389 | Yeah, that's, that's, that's a good, that's a good point. I, I think ... But this is also something that's not new for AI. Like, if you look, for example, back to the, to the, to the, to the, to personal computer, it was not the computers that people were working, uh, using at work that they wanted to use at home. It was the stuff that they were using at home that they wanted to use at work. 390 | 391 | 00:40:35,324 --> 00:40:36,004 [Speaker 1] 392 | Yes. 393 | 394 | 00:40:36,004 --> 00:40:44,893 [Speaker 3] 395 | And, and that's kind of what we see now with AI as well. It's not the fancy, you know, models that are running somewhere and it's, "Oh, would be nice if I could do it at home." No, no, no. 396 | 397 | 00:40:44,893 --> 00:40:44,903 [Speaker 1] 398 | Right. 399 | 400 | 00:40:44,904 --> 00:40:54,844 [Speaker 3] 401 | It's like a ... That, that ease of use of using the general-purpose models at home that they want to bring that to their, uh, to, to, to their work. And, and that is a- 402 | 403 | 00:40:54,844 --> 00:40:54,854 [Speaker 1] 404 | Exactly 405 | 406 | 00:40:54,854 --> 00:41:02,614 [Speaker 3] 407 | ... I think that's a very good signal that we, that we see that, uh, that that's happening. This is ... Oh man, thi- this has been fascinating. 
So, uh- 408 | 409 | 00:41:02,614 --> 00:41:02,614 [Speaker 1] 410 | Yeah 411 | 412 | 00:41:02,614 --> 00:41:17,004 [Speaker 3] 413 | ... last but not least, if people listening are curious and, and, and maybe working in the enterprise and they're like, "Hey, we wanna, you know, we wanna start to, to, to, to tinker around with, uh, with this new, uh, SaaS platform," where, where, where do they go? Where they ... How do they reach out to you? 414 | 415 | 00:41:17,004 --> 00:41:39,484 [Speaker 1] 416 | Yeah. No, that's, um ... That's a good question. So I think there's many ways to connect with us. I mean, saas.com/ram is the starting point. It's rather simple. Saas.com/ram gets you started in terms of, you know, what the product is about. Uh, I am, you know, on LinkedIn, so you can look me up, Sourabh Mishra SaaS. Uh, I think there's only one with those combinations. 417 | 418 | 00:41:39,484 --> 00:41:40,524 [Speaker 3] 419 | [laughs] 420 | 421 | 00:41:40,524 --> 00:42:07,884 [Speaker 1] 422 | So, you can find me pretty easily there if somebody's interested. And I'm always, uh, happy to connect and, you know, talk about, uh, what we can do. Uh, like I said, I mean, I think this is a ... There's still a lot of learning to do, so we are, we are in it together. And, you know, we're uncovering new patterns and, and new questions every day. And I think this is ... I mean, there's ... You know, we talk about challenges and all of that, but this is also just, like, super interesting time. This is like, you know, one of the best times to build, right? 423 | 424 | 00:42:07,884 --> 00:42:07,924 [Speaker 3] 425 | Yes. 426 | 427 | 00:42:07,924 --> 00:42:28,244 [Speaker 1] 428 | I mean, if you think about it. So I am ... I'm really thrilled with what we have built so far. And, you know, m- like we talked about that the space moves fast, so we cannot be, like, sitting on this, so we are building already the next layer of, you know, what way this goes. And really, really grateful for the, for the partnership that, that we have. 429 | 430 | 00:42:28,244 --> 00:42:49,594 [Speaker 3] 431 | Fantastic. Well, thanks. Oh man, wasn't this great, Connor? Yeah. Yep. Yeah. [laughs] Yeah, thanks so much, Sourabh. This was, this was great. We, we love to have these conversations. We, uh ... Which, what, where, what, which number are we in, uh, when it comes to podcasts now, uh, Connor? Which, which, which number wi- will this be? 129. [laughs] 432 | 433 | 00:42:49,594 --> 00:42:50,114 [Speaker 1] 434 | 129. 435 | 436 | 00:42:50,114 --> 00:42:52,324 [Speaker 3] 437 | 129 podcasts. Yeah. [laughs] Yes. 438 | 439 | 00:42:52,324 --> 00:42:54,144 [Speaker 1] 440 | Wow. You guys have been cranking. 441 | 442 | 00:42:54,144 --> 00:43:17,484 [Speaker 3] 443 | Yeah. Yes, we have had builders, scientists, startups, enterprises, you, you, you name it. It's like a, uh ... And it's like ... And, and, and, and Connor is running the podcast for so long already. It's like, it's fantastic. And it was absolutely amazing that I, uh, could join you, uh, Sourabh, to, uh, you know, to talk about ev- the great stuff you're building on, on, uh, uh, at SaaS with Ram. So that's great. 444 | 445 | 00:43:17,484 --> 00:43:48,203 [Speaker 1] 446 | Yes. No, likewise. I, I appreciate the opportunity. And I think, uh, we are looking to kind of get some customer wins now. And our, our goal is to kind of make sure that we are able to connect the dots. 
So, so working with your team, trying to make sure that we are able to kind of bring them in, provide the appropriate support that they, that the customers need. And, you know, kind of just going back those still new space, still new technologies at some level. So I think there's, there's a lot of room for us to kind of continue to partner in the future. 447 | 448 | 00:43:48,203 --> 00:43:52,453 [Speaker 3] 449 | Fantastic. Thank you so much. This was, this was great. [laughs] 450 | 451 | 00:43:52,453 --> 00:43:55,784 [Speaker 1] 452 | Thank you. Yes. No, I enjoyed it. Thank you for the opportunity and, uh, have a good one. -------------------------------------------------------------------------------- /data/podcast-transcripts/130.txt: -------------------------------------------------------------------------------- 1 | 00:00:00,140 --> 00:00:20,440 [Speaker 0] 2 | Hey, everyone. Thank you so much for watching another episode of the Weaviate podcast. I'm super excited to welcome Xiaoqiang Lin, a PhD student at the National University of Singapore. During his time at Meta, Xiaoqiang led the research behind Refrag, Rethinking RAG-based Decoding. We're super, super excited about this work at Weaviate, and I can't wait to dive into this further. Xiaoqiang, thank you so much for joining the Weaviate podcast. 3 | 4 | 00:00:20,440 --> 00:00:23,280 [Speaker 1] 5 | Yeah, great to be here. 6 | 7 | 00:00:23,280 --> 00:00:35,600 [Speaker 0] 8 | Awesome. So, uh, could we maybe kick it off, uh, for people who aren't already familiar with Refrag and m- what it is, the key innovations? Do you mind s- maybe sort of giving the, uh, the helicot- helicopter elevator pitch of what it is? 9 | 10 | 00:00:35,600 --> 00:01:38,860 [Speaker 1] 11 | Yeah, sure. So, um, basically, Refrag is a early attempt, um, um, that I did, um, to, um, accelerate the latency in, um, [smacks lips] our retrieval augmented generation. So, specifically if you think about, um, the current re- um, RAG systems, right, the latency is pretty high, um, especially when, when you talk about, um, um, doing this search agent, right? You, you have, uh, so many search result from the, uh, Google Search engine or, um, Bing Search. You have so many search result and you, you put those search result in a prompt, and that makes the, the LM inference really slow. And, um, the result of that is that user will be, um, having, um, very bad experience because they have to wait very long before you get the answer. So, um, Refrag is really a attempt, early attempt to, um, address this issue and, um, try to accelerate the late... uh, accelerate the LM inference, um- 12 | 13 | 00:01:38,920 --> 00:01:38,930 [Speaker 0] 14 | [clears throat] 15 | 16 | 00:01:38,930 --> 00:01:45,920 [Speaker 1] 17 | ... specifically designed for RAG system, um, so that user can have better experience. 18 | 19 | 00:01:45,920 --> 00:02:16,160 [Speaker 0] 20 | Yeah, amazing. I think there's so many cool details to it, like the, uh... You kinda mentioned the sort of time to first token improvement and, yeah, overall, I think there's a really interesting topic to go down about time to first token versus throughput. Hey, do you mind maybe just sort of explaining this concept of say, you have the 256-token chunk in the vector database, and then within it you have, say, 16, 16-token chunk vectors, and then you also have the token vectors. Maybe just explain the sort of like multi-granular representation. 21 | 22 | 00:02:16,160 --> 00:04:45,400 [Speaker 1] 23 | Yeah, sure. 
So, um, basically, um, in Refrag, right, the way that we design the system is, is that we, we kind of modeled the idea from the vision language model where people treat, um, um, vision, uh, like images, right, as a modality, and they kind of, um, use an embedding or a single embedding or several embeddings to represent certain images. Um, and, um, it's, it's kind of similar idea in our case, 'cause we treat, um, chunk as, um, a new modality, right? And, um, the way that we put chunk in a system is not by putting the, the raw tokens, um, like what we did for normal RAG system, right? We just, um, keep all the tokens from the chunk, um, uh, in, in the decoder input. Um, and then we just ge- uh, generate the answer. The way we do it is that we transform those chunk into one single embedding or several embeddings so that, um, we, we... Instead of using the raw token, we use those embeddings to generate the answer directly. And, um, the benefit of doing that is, um, is quite intuitive, because if you think about raw tokens, right? Let's say you have a chunk with 16 tokens. Um, it will take 16 tokens... Um, it, it will take 16, um, embedding, uh, token, uh, in the context, right? And that's really undesirable because, um, you can... you could have so many chunks, and, um, that would, uh, uh, make the context, um, super long. But, um, the way that... Uh, if you do... Uh, if you do it, uh, in our way, um, instead of putting 16 chunks directly in a decoder, now you can use one single chunk embedding to represent the whole chunk, and that would only take one single, uh, position in a chunk embedding, uh, in a, in the context window, right? And, um, so this is a reduction by 16 times if you think about it, right? Let's say you have, um, 16K, uh, context. Um, after using, uh, our Refrag, you only need to put 1,000 tokens in a context, and that's really a significant reduction in terms of the prompt, uh, the length of the prompt. And, um, yeah, that's really why w- we could have, um, those significant performance, um, boost, um, from our evaluations. 24 | 25 | 00:04:45,400 --> 00:05:06,830 [Speaker 0] 26 | Yeah, it's super interesting, and I, I had a lot of fun playing with these numbers too, mentioning going from 16,000 token vectors down to 1,024 chunk vectors when using, you know, K equals 16. But, say you use K equal 32, and now you have 512 vectors. If you just think about the reduction in the input length from 16,000 to 512, and, yeah, it's really amazing. 27 | 28 | 00:05:06,830 --> 00:05:06,840 [Speaker 1] 29 | Yeah. Exactly. 30 | 31 | 00:05:06,840 --> 00:05:27,180 [Speaker 0] 32 | And I guess, um, it just really inspires your imagination. I was... As I mentioned, I was playing with these numbers a little bit, and I went up to thinking about, like, 65,000 down to 2048 or 262,000 [laughs] down to 8192 as I'm just, you know, multiplying numbers by two and then dividing by 32, but, uh, I, I... 33 | 34 | 00:05:27,180 --> 00:05:27,240 [Speaker 1] 35 | Yeah. 36 | 37 | 00:05:27,240 --> 00:05:42,920 [Speaker 0] 38 | So, maybe my first question is, uh, uh... Or not my first question, but, like, just continuing on this, um, w- maybe even jumping ahead a bit, but what do you think about sort of the limits of this compression? What, what kind of things are you seeing as you really push long-context scaling with Refrag? 39 | 40 | 00:05:42,920 --> 00:06:22,910 [Speaker 1] 41 | Yeah. So, um, actually, we, we did a lot of experiment, um, in early exploration stage. 
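A quick back-of-envelope sketch in Python of the context-length reduction described above. It only restates the arithmetic (one decoder position per k-token chunk); the function name and the example numbers are illustrative, not taken from the REFRAG code.

def compressed_positions(num_prompt_tokens: int, k: int) -> int:
    """Decoder positions needed when every k-token chunk collapses to one chunk embedding."""
    # Ceiling division: a trailing partial chunk still occupies one position of its own.
    return -(-num_prompt_tokens // k)

if __name__ == "__main__":
    for tokens, k in [(16_000, 16), (16_000, 32), (262_144, 32)]:
        print(f"{tokens:>7} prompt tokens at k={k:>2} -> {compressed_positions(tokens, k):>5} decoder positions")

Under these assumed inputs, a 16,000-token prompt drops to 1,000 positions at k=16 (or 500 at k=32), matching the reduction discussed in the conversation.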
Um, espcia- especially, um, in the beginning, you know? The, the, the initial idea is to use one single embedding for the whole chunk. And, um, let's say, like, in normal RAG system, right, usually one chunk could contain, you know, several hundred to one, 1K or, or several K tokens, right? One single chunk.... could contain that much, uh, that many, um, tokens, right? And initially, we want to use one single embedding for the whole chunk. And it turns out that the model, um, doesn't work. 42 | 43 | 00:06:22,910 --> 00:07:54,570 [Speaker 1] 44 | It doesn't work at all because you are- you are looking at a few hundred to 1,000 compression rate and, um, that's really the limit of, um, o- of, um, this kind of system 'cause we- we- we want to have a big compression rate, but, uh, we- we- we would also want to have good quality, right? We want to have a model that, um, has the performance that almost match the original model before the compression, right? So, um, initially, what we- we were thinking, oh, maybe this is because the compression rate is too high that the model, um, is not able to reconstruct the information from the chunky value. So what we did is that we kind of stringed, uh, you know, reduced the, uh, compression rate from a few hundred to, uh, 100 and we further reduced, um, the- the compression rate down to 16 and 32. And, um, so what we found is that if you use the compression rate of 32, your performance would be comparable to the performance of the language model that doesn't use any compression. But if you exceed that 32 compression rate, you would have a major performance degradation and that's really why we- we use this, um, 16 and 32 compression rate, y- because we- we are looking at this, um, you know, p- performance, uh, frontier, right? We- we- we want to have a low latency, but at the same time good performance. 45 | 46 | 00:07:54,570 --> 00:08:25,870 [Speaker 0] 47 | Yeah, amazing. I think just kind of one question I have that transitions right from this is, um, the idea of quantizing the chunk vectors, and so maybe I- you- I don't know how- what- I- I- sorry, I missed this detail when I- I did read the paper thoroughly, but I- I missed this detail about what kind of quantization is used in the chunk vectors. You think you could say, use, you know, binary quantization and things like this to further take down, say, the storage overhead as well as what might be the, yeah, losses as it goes through the decoder? 48 | 49 | 00:08:25,870 --> 00:10:17,510 [Speaker 1] 50 | Yeah. So, um, you- you know, in our Refract, we- we actually do not, um, do, uh, quantization for the, uh, the chunk embeddings. And, um, you know, the- the main focus is on, you know, the, uh, compression of the prompt, the- the length of the input. And, um, the quantization, um, like, w- we should kind of, uh... The quantization, uh, is considered when we try to do the pre-computation, right? Um, think about, um, the, um, vectors, right? If you want to pre-compute the vector for post chunks, we would have to think about, um, how to quantize these, um, chunk embeddings so that we use very few memories, um, in- in the disc to, um, store all these, um, pre-computation result. 
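A hedged back-of-envelope illustration of why disk footprint becomes the concern for pre-computed chunk embeddings, as just mentioned. The corpus size, embedding dimension, and byte widths below are made-up example values, not numbers from the paper.

def embedding_storage_gb(num_chunks: int, dim: int, bytes_per_value: float) -> float:
    """Approximate disk footprint of pre-computed chunk embeddings."""
    return num_chunks * dim * bytes_per_value / 1e9

if __name__ == "__main__":
    # Assumed example corpus: 100 million chunks, 4096-dimensional chunk embeddings.
    for label, width in [("float32", 4), ("float16", 2), ("int8", 1)]:
        print(f"{label}: ~{embedding_storage_gb(100_000_000, 4096, width):,.0f} GB")

Even at half precision this hypothetical corpus needs hundreds of gigabytes, which is why quantization comes up as soon as pre-computation does.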
And in inference time, we could use those pre-computed vectors to do the inference directly and, um, but w- we didn't focus on that because, you know, our work is mainly focusing on- on the, um, the length of the input and, uh, all those quantization method that you- you see in, um, out there can be used to, uh, further, um, reduce the memory requirement for our system. But, you know, o- one thing that I- I- I wish to mention is, uh, for- for Refract, right, we actually provide two kind of result. The first kind of result is that if you do pre-computation for all the chunk embedding, right? Um, meaning that for all the chunks, you pre-compute all the embeddings. And then, um, at the inference time, you just retrieve these embeddings and then generate the result. The other way, um, to- to- to use our Refract is that you don't do pre-computation at all. Um, you compute all these chunk embedding on the fly, but in parallel. So, um, 51 | 52 | 00:10:17,510 --> 00:10:22,869 [Speaker 1] 53 | for that scenario, we- we also observe significant, uh, 54 | 55 | 00:10:22,870 --> 00:10:48,390 [Speaker 1] 56 | latency, uh, improvement compared to the uncompressed version. And, um, the- the main reason for that is because, you know, for our- our, um, Refract, right, we- we don't do a lot of, um, self-attention in- in the, um, previous stage. Um, think about, uh, our architecture. For tokens that is, um, not in the same chunk, right, they will never attend to each other 57 | 58 | 00:10:48,390 --> 00:11:48,190 [Speaker 1] 59 | and this is a reduction, um, um, in the- in the number of calculation for the previous stage, right? So, um, even if you don't, um, pre-compute all the embeddings and you do the computation on the fly, you still get a very significant performance boost. So that- that's, um, that's, uh, something that we found, uh, is pretty amazing because think about the production system, right? A lot of time, if you- if you store all these chunk embedding, it will be a huge cost for the, um, you know, for the memory, right? 'Cause you have so many chunks in the internet. And, um, if you- if you have to pre-compute everything, that would be a very significant cost. So a lot of time what, um, people do is that they pre-compute the chunks that was- that were most, um, frequently, uh, accessed, right? Like, maybe, um, top 10%, top 5%. And then 60 | 61 | 00:11:48,190 --> 00:12:22,558 [Speaker 1] 62 | at the inference time, if, um, the query, um, accidentally hit all the cache in the memory, then you could just use those chu- vectors from the, uh, memory to, um, to do the inference. But if- if it doesn't, because they are 95% to 90% of chunks that were not pre-computed, right, you have to compute the, uh, chunk embedding on the fly.... and, um, and our method could also accelerate in that use case, which, um, means a lot for the production system. 63 | 64 | 00:12:22,558 --> 00:13:01,078 [Speaker 0] 65 | That's really cool. I'm already loving this podcast, and I've already learned quite a few things. And, um, yes, I- I love that idea of, um, only pre-computing the top 10%, top 5%. We have ideas like a vector cache in Weaviate and things like this of keeping in memory, um, certain nodes that are often expanded. Um, uh, so there's one detail you just mentioned that I didn't understand that well when I read the paper, which is, uh, you're mentioning that... 
I think it's the block diagonal attention and you mentioned in pre-fill, uh, y- y- chunks don't attend to each other, so, so that means you have a custom kind of transformer for this part? Do you mind maybe just explaining that further for people who don't understand these transformer architectures as well? 66 | 67 | 00:13:01,078 --> 00:14:45,758 [Speaker 1] 68 | Yeah, sure. Um, so think about, um, a specific use case that, um, let's say we, we ask the question on, on who's the president of the US and, um, you could get five search results from the search engine, right? And, um, the way that people usually do rack is that they just concatenate those five chunks, um, into one single prompt. And then, uh, the whole prompt is, um, given to the transformer. And when you do the pre-fill, right? Um, what the transformer will do is that when, when you calculate the KV-cache, it will, uh, calculate the attention value between all the tokens in the prompt. So basically self-attention, right? Multi-head, um, uh, attention. And when you do that, you will calculate all the, um, pairwise, uh, you know, uh, attention value, um, in, in the whole, um, prompt, right? So that means any tokens in the prompt, like let's say any two token in the prompt, they will be attend to each other when, when computing the KV-cache for the pre-fill stage. But in our case, think about the chunk compression, right? The whole chunk is compressed into one single token embedding before we feed it to the decoder, which, which is like LLaMA and, and those models, right? So when you do this compression, um, the tokens within each chunk, they will attend to each other in the enc- in the encoder, right? In the encoder, they will attend to each other and arrive to a final chunk embedding. But what... In the decoder, only chunk embedding will attend to each other, not the token within the chunk, uh, within the chunk, right? Because those token, they are compressed into one single token. 69 | 70 | 00:14:45,758 --> 00:15:30,958 [Speaker 1] 71 | And that means on a chunk level, like, they only attend to each other in a chunk level. And this will reduce the attention calculation by a lot. And we argue that this, um, reduction in a- attention calculation is actually, um... is good because, um, the- the attention calculation, um, itself, um, for the tokens from different chunks, they are redundant. They are not necessary because we have revisualized, um, the, uh... these attention values and, and we- we- you eventually find that, oh, this attention value, they are very small and they shouldn't be calculated, um, in the first place. 72 | 73 | 00:15:30,958 --> 00:15:37,638 [Speaker 1] 74 | So th- that's why, um, we have this, um, argument here. 75 | 76 | 00:15:37,638 --> 00:16:01,898 [Speaker 0] 77 | And so I think, um, w- with that argument, it- it's kind of interesting. I... Like when I first started reading Refrag, I... So I always do, I'm sure everyone does this now, is you give the paper to Cloud or ChatGPT and Gemini, whichever language model, and you say, "Hey, could you summarize this?" And this was all the language models, this was their favorite takeaway, is this kind of, um, motivation of there's redundant information in the retrieved context. And therefore- 78 | 79 | 00:16:01,898 --> 00:16:01,948 [Speaker 1] 80 | Yeah 81 | 82 | 00:16:01,948 --> 00:16:39,998 [Speaker 0] 83 | ... that's why you have this shortcut around attending over the context. Um, so, so because... A- and then kinda... 
From our experience at Weaviate, I- I definitely think there's a lot of, like, you're ranking the candidate documents by their relevance to the query. There's oftentimes isn't... There's isn't really some sort of, like, diversity mechanism in there. But, um... And I don't mean to stay on this topic too long, but, uh, but just do you think this is because retrieval doesn't have too much sort of like, uh, relevance to de- de-duplication sort of, and ma- maybe if you had like a diversity, like a maximal marginal relevance or something that's trying to promote diversity in search results, maybe this would 84 | 85 | 00:16:39,998 --> 00:17:04,318 [Speaker 0] 86 | change it a little bit. And I think, I think it'll... Sorry, this is a long-winded que- long-winded question, but I think as we talk about this RL for selective expansion, and this is such a, a big part of this, but yeah. So, sorry, I don't know what the question i- is in that. Maybe just, like, sort of, do you think if there's diversity mechanisms in the search results that that might change it a little bit? Uh, yeah. 87 | 88 | 00:17:04,318 --> 00:20:00,088 [Speaker 1] 89 | Yeah. Um, so a- actually, for the RL policy, it's not, uh, really relevant to the, uh, divers- diversity consideration. Um, I would say, like, the diversity consideration you mentioned, it- it's really practical. You, you are right. A lot of, um, uh, system, these RAG systems, right, in the final layer, they will do de-dupe. Basically, um, if two chunks they are too similar, they will, um, they'll kind of cut one off, one, um, away from the, uh, final result, right? And, um, that would affect, uh, that would actually affect, um, the, um, attention values between dif- different chunks, right? Uh, this is really, uh, why we were saying that for tokens for dif- from different chunks, right, they have smaller attention value because those tokens, they're actually from very different sentences, right? And, um, and if... They are not, they are not coming from the same natural language sentence, right? They are from different, uh, snips, from different ar- articles, right? So that's really the fundamental reason why these tokens, when they attend to each other, they have very low, uh, attention value because they are not from the same, uh, sentence or the same article.So, um, that really affect, um, um, the, the, the attention value in, in a pre-fill. And, uh, and also why we, we would, uh, we would want to do the, um, compression, right? 'Cause, um, this attention value, they, they are not... They are redundant. And o- on the other policy, I, I, um, it's really a different com- consideration. Um, so the, the reason why we were thinking about this, our policy, is because when, when we, um, when we train our model, right, um, we, we figure out that for the chunks with very complex information, the decoder wouldn't be able to understand it, um, very well. So, let's say for one chunk, if it contains numbers and we ask the decoder to repeat the number, um, in, in the, using that chunk embedding, right? We... What do we observe is that, um, the numbers, they will be, um... They, the decoder wouldn't be able to reconstruct the number exactly, especially if the number is super complex and very long. So, um, that's really, um, make us, you know, uh, thinking, uh, that, uh, maybe this compression is not perfect, right? 
'Cause, um, even though we, we, uh, match the performance or, like, have a small, very small, uh, uh, uh, degradating perplexity, but, um, when we actually use the model, um, the decoder cannot, uh, understand complex chunk information, especially when the chunk, uh, contain very complex information. So, um, we have the idea that not all chunk they're created equal. 90 | 91 | 00:20:00,088 --> 00:21:02,528 [Speaker 1] 92 | 'Cause, um, some chunk, they have more complex information than others, like numbers. Uh, but some chunk, they only contain semantic, high-level semantic. Right? And, um, in that case, we believe that for those chunk, uh, chunk embedding should be sufficient to express the high-level stuff. But when, when it comes, comes to those very detailed numbers and very detailed informations, we think there are more entropy in those, um, in those chunk, and we, we shouldn't compress it because doing compression really hurts those very detailed information, and the decoder wouldn't be able to repeat or, like, even reconstruct. So, um, the R selection policy, uh, uh, the R selective, uh, compression is really to address this problem of, um, different chunk having different amount of information. And we believe that, um, we can train a policy to automatically decide which chunk are more informative than other chunks. 93 | 94 | 00:21:02,528 --> 00:21:26,488 [Speaker 1] 95 | And for those, uh, more informative chunks, we put the raw token in the decoder. But for the less informative chunks, we put a single chunk embedding in the decoder to represent the whole chunk. So, this really, um, further boosts our, the performance of our model because now we, we, we n- we have a, a simple policy to decide, um, 96 | 97 | 00:21:26,488 --> 00:21:35,408 [Speaker 1] 98 | whether we should compress or not. And that would make the generation quality much better. 99 | 100 | 00:21:35,408 --> 00:22:55,948 [Speaker 0] 101 | Yeah, amazing. I, I wanna come back to talking about how you train the expansion policy, but just reflecting on that idea of kind of, uh, expanding out one chunk because it contains a number, this is maybe sort of the boldest idea I sort of had as I was reflecting on Reffrag, was sort of, are vector embeddings all you need? And the sort of... So, so kind of I was thinking about, um, there's a big argument around using LMs to extract structured data from unstructured data. Say I take the markdown of my blog and I extract date published or the authors. Compared to the sort of long context, uh, uh, it just, you just give all your data to the LLM. And now that it... Now that all your data has these vectors associated with it, they can expand the parts of the, uh, parts of the chunks that contain that date published information, especially as we talk about these huge compressions. Like, I could have... Yeah, I, I don't have that quick lookup, but say 262,000, I think, goes down to 8,000 tokens when you have K=32. But... And you can imagine all sorts of combinations like this, but that kind of ability to just expand out each of the chunks that would contain the date published, I think it's a, a really profound idea if we're thinking about the future of database systems with these kind of, uh, yeah, these kind of Reffrag long-context architectures. Yeah. 102 | 103 | 00:22:55,948 --> 00:22:58,508 [Speaker 1] 104 | Right. 
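A minimal sketch of the selective compression idea described above, with all names (build_decoder_input, embed_tokens, encode_chunk) as illustrative stand-ins rather than the actual REFRAG implementation: a policy flags which chunks to expand, and the decoder input mixes raw token embeddings for the expanded chunks with a single embedding per compressed chunk.

from typing import Callable, List, Sequence

Vector = List[float]

def build_decoder_input(
    chunks: Sequence[str],
    expand_flags: Sequence[bool],                  # output of the selection policy
    embed_tokens: Callable[[str], List[Vector]],   # raw per-token embeddings (stand-in)
    encode_chunk: Callable[[str], Vector],         # single compressed chunk embedding (stand-in)
) -> List[Vector]:
    positions: List[Vector] = []
    for chunk, expand in zip(chunks, expand_flags):
        if expand:
            positions.extend(embed_tokens(chunk))  # informative chunk: keep every token position
        else:
            positions.append(encode_chunk(chunk))  # other chunks: one position for the whole chunk
    return positions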
105 | 106 | 00:22:58,508 --> 00:23:12,428 [Speaker 0] 107 | Yes, I guess, um, I, I guess, I'd love to just get your, your quick take on that, if you think that, like, this could be, like, sort of... I- if you c- if you could scale this really tremendously to maybe this is, like, a totally new kind of database system. 108 | 109 | 00:23:12,428 --> 00:24:51,168 [Speaker 1] 110 | Yeah. Um, I, I think, uh, the way that, uh, Refrag designed the RAG, right, um, could affect the, uh, the way that we, we design the, um, database system, um, nowadays, right? Uh, uh, so again, um, s- for... I, I would like to mention, I, I'm not an expert in database, and I, I know very few, uh, technical stuff in this field. But, uh, I do think that Refrag will... would, um... If, if it's... If we could put, uh, Refrag in a production, we could potentially change the way, um, how we serve the RAG and, uh, the way we interact with the database. And, um, 'cause p- uh, think about the, the, um, database, uh, we use for, um, RAG system right now, right? Um, in the final layer, we would do the, um, retrieval using the embedding, and we use the query embedding and then, uh, for the chunk embe- and, and we also, uh, compute those... uh, pre-compute all the chunk, the embedding for the chunks. And, uh, when we do the retrieval, we just, uh, do a K-NN search, right? And then, uh, to, to get the k most relevant, uh, chunk embedding. And... But, you know, for the current system, we, we, we just throw all this embedding away.... with... 'Cause, um, we only put the raw token in the decoder, right? So those embeddings, they will actually tr- uh, throw away. And a lot of time, you know, for those production, uh, system, they, they use, uh, heavy quantization on, on those, on embeddings because, uh, for the k-NN, uh, result, right, they, they only need to get, uh, an, a approximately correct [children shouting] [clanging] uh, k, uh, 111 | 112 | 00:24:51,168 --> 00:24:51,178 [Speaker 2] 113 | [laughs] 114 | 115 | 00:24:51,178 --> 00:25:00,508 [Speaker 1] 116 | ... top-k, uh, rather than, uh, uh, result, right? So I, I believe that, um, if we really want to use Refrag, we have to reform this, um, 117 | 118 | 00:25:00,508 --> 00:25:31,628 [Speaker 1] 119 | or reconfigure the database, so that, um, instead of focusing the chunk, uh, the, the, the token in the chunk, we should focus on, um, the vectors in, in the database. And because in our RAG syst- in our Refrag, we, we actually need, uh, the full precision, uh, vectors, right? Um, but, but for the current system, those embeddings, they, they are... they don't really matter. They, they do heavy quantization, and, uh, they, they also do, uh, 120 | 121 | 00:25:31,628 --> 00:25:56,548 [Speaker 1] 122 | um... they, they throw it away when doing inference. So, um, w- w- if, uh, Refrag really, um... is really used in production, m- maybe we, we need to, uh, think about how, how we should... how, how we should deal with, um, a database that focus more on the... on the vector itself instead of the raw tokens. 123 | 124 | 00:25:56,548 --> 00:26:34,858 [Speaker 0] 125 | Yeah, amazing. Yeah, I, I, I think obviously kind of like with Weaviate and our vector database, we're super excited about just m- more users storing vectors in this totally new application. But I just like, sort of inspired by long context generally, I think there's sort of this really interesting intermediate query in addition to sort of k-NN for, like, question answering. 
And then there's also kind of like structured retrieval where you have, like, an exact property or like attribute, like price is a float and it's either less than 2 million or, or greater, like things like this. Let's say you're searching through houses or so- something like that. But then I think there's kind of this inter- this interesting like- 126 | 127 | 00:26:34,858 --> 00:26:34,858 [Speaker 1] 128 | Mm-hmm 129 | 130 | 00:26:34,858 --> 00:26:50,068 [Speaker 0] 131 | ... intermediate query where say you have a database of images and you're saying, "How many have a nice, a nice pool that looks like this pool?" And it's like... it's not something that's pre-computed. It's gotta compute it on the fly and then it's gotta do it to an absolutely enormous scale. That's kind of where I see this eff- 132 | 133 | 00:26:50,068 --> 00:26:50,568 [Speaker 1] 134 | Mm-hmm. 135 | 136 | 00:26:50,568 --> 00:27:25,347 [Speaker 0] 137 | This is like a new... another new application. But anyway, so, so yeah, so thanks for taking that detour on me. I think maybe this topic of vector embeddings are all you need is like a funny [laughs], a funny kind of takeaway. And, but I think, I think definitely more future-looking of a thing, but I wanna kind of come back into, yeah, the, the RL expansion policy and how you train it to sort of just come understand the mechanisms more. I'm curious... I'm very curious about the sort of sequential selection of chunks. Uh, do you see that as being a latency bottleneck? Um, yeah. 138 | 139 | 00:27:25,348 --> 00:28:05,708 [Speaker 1] 140 | Yeah, that's a good question. Um, actually, uh, when we designed this, our policy ready, um, we were thinking about latency, uh, consideration. 'Cause if you think about sequential selection, right, you would have to do it... Um, so let's say we have n chunks and we want to select m chunks to uncompress. Um, so we have to roll out the policy for m times to eventually, uh, get the m chunks to, to decompress, uh, to uncompress. So, um, so the... W- w- we did some, um, optimization to mitigate that problem. And the way that we do it is that, um, it's kind of like point network. Um, 141 | 142 | 00:28:05,708 --> 00:28:51,888 [Speaker 1] 143 | so the, the, the... I, I don't know if you're familiar with point network. The way that it, it does is that they only forward, um, the... So for the policy network, right, they only do one forward. And after that one forward, they could automatically decide, um, what are the end, uh, s- uh, end steps to take, um, by doing masking. So, um, let me give you an example. Let's say we have a number of, uh, embedding as input, right, uh, to our policy network. And, um, our policy network will eventually output a, um, multinomial distribution on this end number of, um, a number of, uh, chunks. And for that multinomial distribution, um, 144 | 145 | 00:28:51,888 --> 00:30:02,028 [Speaker 1] 146 | in, in the first step we pick one chunk, um, from... by sampling from that multinomial, uh, distribution. After we sample that chunk, uh, we don't re-forward, uh, the... do another forward for the policy network. What we did is that we take the logins from the output, and then we mask the position that we already selected, and we do, um, softmax again, to, to get another multinomial distribution. And we sample from that, uh, for the next action. So basically, rolling out m times only means we do m sampling from, uh, a distribution that was masked and renormalized. And, uh, this is really, uh... 
I, I think it's, it's, it's a quite common way to, uh, to mitigate, uh, the, uh, the latency, um, in, in, in that field. So, um, we, we just decided to adopt that, um, solution 'cause, um, you know, doing multiple steps, the main bottleneck is that you have to re-compute the... the, um, the output for many times. But now we, we save that, um, um, steps and, um, 147 | 148 | 00:30:02,028 --> 00:30:08,468 [Speaker 1] 149 | do not add, um, additional, um, latency to the system. 150 | 151 | 00:30:08,468 --> 00:30:23,978 [Speaker 0] 152 | That's amazing. Yeah, I wish... Uh, I, I can't wait till the sort of video generation tools get to the point where I... where we can take that clip, your explanation of that, and then just give it to say Veo and it'll output one of those three blue, one brown animations of this multinomial sampling. 153 | 154 | 00:30:23,978 --> 00:30:24,008 [Speaker 1] 155 | [laughs] 156 | 157 | 00:30:24,008 --> 00:30:31,488 [Speaker 0] 158 | I think that'll be such a cool thing for these technical podcasts. So, but anyway, so, so yeah. Uh, s- sort of, uh, transitioning a little bit- 159 | 160 | 00:30:31,488 --> 00:30:31,498 [Speaker 1] 161 | Yeah 162 | 163 | 00:30:31,498 --> 00:31:03,876 [Speaker 0] 164 | ... maybe coming out of just training the reinforced learning expansion policy, and then, you know, and the policy itself, if we could talk about this sort of like-... a four-stage training algorithm for training Refrag. I think we maybe glossed over it as we were just, you know, talking about all these topics. But, um, you- you first have this thing where you take the encoder and you project the embeddings into the decoder's latent space, uh, and- and then you have this sort of reconstruction task, continual pre-training. Uh, do you mind just kind of explaining these stages of, uh, training Refrag? 165 | 166 | 00:31:03,876 --> 00:32:09,096 [Speaker 1] 167 | Yeah, sure. So, um, all these stages... So the- the main, uh, purpose of, m- our work is actually to develop- develop this, uh, four-stage training recipe. And the goal of our work is really to, um, have a general res- training recipe so that we can adapt any decoder and encoder combinations to, um, t- to work like our system, uh, but like what we- we- we want, right? And, um, so but the- the main, uh, you know, the- the main problem that we need to address is that those encoder model, right, they do not, um, uh... Those decoder model, right, they do not naturally understand the- the embedding from the- the- the encoder. And this is because they were not trained, um, together during the pre-training, right? So, the decoder naturally do not understand the chunk embedding. We need to find a way to align the encoder and decoder so that decoder could understand it, the- the embedding information, and actually use that information to help us, um, to answer the question. So, 168 | 169 | 00:32:09,096 --> 00:32:34,056 [Speaker 1] 170 | um, to do that, you know, um, we- we actually tried, uh, a lot of stuff. Um, if you just train, uh, train the rack, um, SFT, um, using supervise- uh, supervised fine-tuning on- on encoder and decoder for the rack tasks, it doesn't work because, um, the decoder would- uh, would, uh, find an easier local optimum by ignoring all these chunk embeddings. 171 | 172 | 00:32:34,056 --> 00:33:45,475 [Speaker 1] 173 | So, that's really why we would want to use the reconstruction task because we want the decoder to force... 
Um, to be forced to, um, attend- to attend to the chunk embedding and then actually use that information to help us to generate the answer. And we find that- that the reconstruction task is really a- a good way to- to force the en- the decoder to pay attention because it, um, if the decoder could pay attention to the chunk embedding and, uh, actually understand the chunk embedding, it should perfectly reconstruct everything that was encoded in- in that embedding, right? So, that's our assumption. And, um, so for our reconstruction task, we just chunk a- a big, uh, a- a very long text and then, uh, compute the chunk embedding, and we just ask the decoder to use those chunk embedding to, uh, reconstruct what's, uh, inside the chunk embedding. And that really helps a lot for the decoder to pay attention, um, to the- to the, um, chunk embedding. And during the process, we found that, um, that curriculum learning is also important because, um, 174 | 175 | 00:33:45,476 --> 00:35:29,836 [Speaker 1] 176 | they- they are different level of, uh, difficulty, um, in- in terms of reconstruction. So, think about the decoder take one chunk embedding versus the decoder take 100 chunk embedding. These two tasks, they are very different and in- in terms of, uh, difficulty level, right? So, um, let's say one chunk have 16 tokens. If you reconstruct 16 token from one chunk embedding, that's actually a very easy task for decoder. But if you... I- if you want the decoder to reconstruct, uh, maybe 2K token from 128 chunk embedding or 200... And that- that- that's actually very difficult because, um, the decoder was never, um, trained to take so many chunk embedding in the first place. So, it... We- you would have to, um, learn this process step by step. So, it- it will take a look at the, uh... It- it will start learning with fewer chunk embedding, and as the training pro- progress, we will want to put more chunk embedding in the input so the decoder can, um, slowly adapt to more chunk embedding and- and eventually, um, arrive to the checkpoint that we want. So, these- these techniques, they actually, um, vital to their... To the successful, um, of the alignment. And so what we found is that if you don't do these two task, like remove e- any of these two, uh, um, tasks, you wouldn't be able to get a- a good alignment. And a lot of time, the decoder would just ignore all these chunk embeddings. So, we found these two stage very useful and, um, to some extent, they are necessary in- in our case. Yeah. 177 | 178 | 00:35:29,836 --> 00:35:30,766 [Speaker 1] 179 | So, these are- are the two, um- 180 | 181 | 00:35:30,766 --> 00:35:31,496 [Speaker 0] 182 | That's super interesting 183 | 184 | 00:35:31,496 --> 00:35:44,375 [Speaker 1] 185 | ... um ... you know, sir? Yeah, yeah. Th- these are two- two main training recipe for, uh, getting, uh- uh, getting the decoder to understand a chunk embedding. 186 | 187 | 00:35:44,375 --> 00:36:13,756 [Speaker 0] 188 | Yeah. Sorry to cut you off of it. I think our- our latency a little bit as we stream our podcast [laughs] around the world, uh, cut me off a li- or confused me a little bit. But anyways, um, yeah, it's so interesting to hear about the curriculum learning. I remember kind of like, um, when, uh, MosaicML had their MPT, uh, 65K and they were doing... 
I think it was like alibi attention and rope scaling and- and this kinda thing of, yeah, we just took the pre-trained language model and now we're introducing long gen- long inputs into the data and that, like, you- 189 | 190 | 00:36:13,756 --> 00:36:13,766 [Speaker 1] 191 | Mm-hmm 192 | 193 | 00:36:13,766 --> 00:36:58,596 [Speaker 0] 194 | ... you have this curriculum learning. It's something that I could see maybe being a part of the DSPy story. I- I have to put a DSPy reference in all my... in all the podcasts, but, um, uh, yeah. So, uh, so talking... I wanted to ask more about the mechanics behind trading the decoder with the encoder. Um, maybe also on DSPy where now I- I really am excited about the Arbor tool and how Arbor is letting us use GRPO with models in VLLM. Um, but to then also put the gradients from the decoder back to the encoder is-... uh, so would you... You would have to have the same PyTorch... You had to use say, PyTorch or something like that, right? Y- c- y- t- ho- like, ha- how particularly do you implement this, where you take the gradients from the decoder and put 'em back to the encoder? 195 | 196 | 00:36:58,596 --> 00:37:44,696 [Speaker 1] 197 | Yeah. It a- actually, it's very, uh... You know, PyTorch is really a, a powerful, uh, you know, tool. Um, uh, you don't actually need to do anything. Just make sure that everything is, um... everything is, um, uh... e- every tensor in, in the n- network has a gradient, and it will automatically prop and, uh, just get a back propagate to the whole network. So, um, what I did is that I just take the, um, encoder output and then do a projection to the token embedding, and then put that token embedding in the decoder input and in the decoder input. So, the whole system has, um, uh, gradient information and the PyTorch will, uh, automatically, um, do the back propagation for you. 198 | 199 | 00:37:44,756 --> 00:38:47,936 [Speaker 0] 200 | S- s- uh, have you, have you also maybe looked into say, using like, Hugging Face, uh, could make it maybe easy to... Because I'm, I'm thinking about, uh, PyTorch sometimes m- 'cause I'm trying to grab a pretrained LLM and a pretrained embedding model, and I'm trying to now throw them into my system. Um, do you think... Y- yeah, I d- I don't know what the current state is between PyTorch versus Hugging Face and what's easier to load pretrained weights. So, just be honest, I'm mostly using vLLM and I haven't really thought too much about, about how you're connecting these things. But, um, uh, yeah. So I guess my question would be, uh, and maybe just kind of coming into the tooling, I think the big question would be, uh, w- t- when we see Refrag LLMs, is the API gonna have an embedding model as well as an LLM, you think? Do you think it'll be easy to sort of plug and play, bring your own embedding model, bring your own LLM, and then there's some like, Refrag comp- pilation layer on top of... How, how do you see these models sort of spreading into the 201 | 202 | 00:38:47,936 --> 00:38:51,036 [Speaker 0] 203 | market if... Yeah. 204 | 205 | 00:38:51,036 --> 00:38:55,996 [Speaker 1] 206 | Um, are, are you asking about the open source of the model or, uh, like open source of the code- 207 | 208 | 00:38:55,996 --> 00:38:56,006 [Speaker 0] 209 | Yeah 210 | 211 | 00:38:56,006 --> 00:39:03,736 [Speaker 1] 212 | ... so that the community could, um, just, just try on, on their, uh, model and, and on their database? 213 | 214 | 00:39:03,736 --> 00:39:06,396 [Speaker 0] 215 | Uh, yeah. Sorry. 
It was [laughs] I was just like, thinking out loud. 216 | 217 | 00:39:06,396 --> 00:39:07,176 [Speaker 1] 218 | Yeah. So the- 219 | 220 | 00:39:07,176 --> 00:39:07,956 [Speaker 0] 221 | Uh, maybe we could- 222 | 223 | 00:39:07,956 --> 00:39:24,136 [Speaker 1] 224 | It's a, it's a tricky question. [laughs] Um, 'cause you know, I, I actually... I, I have done all the, all the work, but, uh, w- we are working, uh, super hard to, um, to push that, um, to, uh, the, the open source community. And, um, 225 | 226 | 00:39:24,316 --> 00:40:00,296 [Speaker 1] 227 | so just stay tuned. We, we will have that, um, available very soon. And, um, yeah, I, I, I think, uh, if you take our code, um, you could just, um, run on any Hugging Face pretrained decoder and encoder 'cause our model, um, our repo is built on top of those, um, Hugging Face APIs and, um, transformer APIs. So, um, you, you could just, um, take a, uh, take a checkpoint from Hugging Face and directly use in our system to train your own encoder/decoder. 228 | 229 | 00:40:00,296 --> 00:40:12,476 [Speaker 0] 230 | And so then I guess maybe just asking about the sort of computation that that might require for open source. Uh, like how m- like how many GPUs am I gonna need to... uh, and, you know, GPU hours? [laughs] 231 | 232 | 00:40:12,476 --> 00:40:22,636 [Speaker 1] 233 | Yeah. So, um, that, that's a good question. Um, actually for, um, the 7B model that I tried previously, um, 234 | 235 | 00:40:22,636 --> 00:40:32,596 [Speaker 1] 236 | y- you could run it on a single machine with eight cards, and... but that would take you, um, maybe, uh, 237 | 238 | 00:40:32,596 --> 00:41:16,996 [Speaker 1] 239 | five to s- uh, five to six days, um, i- if, if you train on one single machine. Um, so, uh, for the whole... uh, like for all the training stages. So I, I, I think if you, um, if you really, um... if, if you don't have a lot of resource, right? For example, one G- one, one machine with eight GPUs, um, you could, you could train our model very easily, but it will take some time. Yeah. It will take a few days to finish all the... to, to finish all the, um, all the stages. But if you have more GPUs, like, um, maybe, um, eight machines with eight cards, you could finish it in one or two days. 240 | 241 | 00:41:16,996 --> 00:42:02,196 [Speaker 0] 242 | That's pretty exciting. Yeah. I think, uh, I, I, I'm super excited to just see the sort of releases of these models and... Yeah. I nee- I, I'll admit I need to dive into the code more. I've been sort of looking at this from just like, trying to wrap my head around the algorithm. But yeah, I'm definitely really excited to be diving into that. Uh, yeah. May- so maybe we could also just talk a bit more about some of the experiments you did in fa- so, uh, so we've been s- so I mean, I think that it's absolutely amazing that it would take you five to six days single machine to do that sort of reconstruction task to align the encoder and the decoder and this continual pre-training to get the decoder used to attending over these compressed chunk embeddings. Uh, but then there's this, um, sort of downstream task tuning phase, and I wanna just maybe talk... like, ask more about your experience with, uh, evaluating- 243 | 244 | 00:42:02,196 --> 00:42:02,206 [Speaker 1] 245 | Mm-hmm 246 | 247 | 00:42:02,206 --> 00:42:09,936 [Speaker 0] 248 | ... Refrag on question answering, multi-turn conversation and summarization, and just what you think about that sort of last leg of the mile? 
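A minimal PyTorch sketch of the wiring described a couple of exchanges back, where the encoder output is projected into the decoder's token-embedding space and spliced into the decoder input so that autograd carries gradients back to the encoder side. Dimensions and tensors here are stand-ins, and the mean-squared loss is only a placeholder for the decoder's actual language-modeling loss.

import torch
import torch.nn as nn

enc_dim, dec_dim = 768, 4096                     # assumed encoder / decoder hidden sizes
projection = nn.Linear(enc_dim, dec_dim)         # trainable bridge into the decoder's embedding space

chunk_embedding = torch.randn(1, 1, enc_dim, requires_grad=True)  # stand-in for one encoder output
question_embeds = torch.randn(1, 12, dec_dim)                     # stand-in decoder token embeddings

projected = projection(chunk_embedding)                           # (1, 1, dec_dim)
decoder_inputs = torch.cat([projected, question_embeds], dim=1)   # splice the chunk into the prompt

loss = decoder_inputs.pow(2).mean()              # placeholder for the decoder's language-modeling loss
loss.backward()                                  # autograd pushes gradients back through the projection
print(chunk_embedding.grad.shape)                # torch.Size([1, 1, 768]): the encoder side receives gradients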
249 | 250 | 00:42:09,936 --> 00:43:05,312 [Speaker 1] 251 | Yeah. So, um, actually in our project, we don't focus on th- on those, uh, downstream tasks 'cause we, we feel that, um, those downstream tasks, they, they are, um... they are kind of, um... it, it's, it's, it's actually difficult to evaluate, um, uh, on those tasks and, um, to, to give a fair comparison. And, um... but we, we try our best to do the benchmark on those, um, downstream tasks like long, long document summarization, where we, um, put the whole document into-... into the context, which chunk the whole document, and then we ask the decoder to summarize what's in the, what's in the document. And we train, uh, the uncompressed LLaMa model, uh, and we also train the compressed model using our, uh, Refrag. And, uh, it turns out that, um, 252 | 253 | 00:43:05,312 --> 00:45:18,071 [Speaker 1] 254 | we get pretty good result when thinking about the same latency. So, uh, if you think about the same latency, right, um, for our model, we could put more context in a, in the context window. But for the original LLaMa model, you will have to, um, truncate some of the context to, to have similar, uh, just to have similar latency. So, um, the long-context summar- summarizes is really a, a, a, a demo, um, a demo task that, uh, we can, uh, immediately see the benefit of Refrag. We could get better performance under the same latency because of having this longer context window. And, um, another benefit, um, we, uh, we could see is, um, from the, uh, RAG tasks that we, uh, we, we did fine-tuning. So what we did is that we, uh, collect, um, two of the two million data points that are specifically designed for RAG, um, for RAG. So basically, each data point has, um, uh, one question and then have a few chunks from the system and has its, and has its corresponding answer. And we have two million [items clatter] of them, and we just SFT our, uh, pretrained, uh, our, our continued pretrained, uh, checkpoint on those tasks. And what we have observed is that, um, under the compression rate of six- 16, right? Under the s- compression rate of 16, the... Given the same amount of information to the original LLaMa model and our model, we could almost achieve similar performance in terms of accuracy of the RAG system. And, uh, another, uh, r- result that, uh, I think is pretty impressive is, is that, um, given the same latency for the RAG system, right, our model could consistently outperform the LLaMa model. Because given the same latency, we could put more chunk search result into the context, and that would reduce the risk of missing the information, um, to... that is necessary to answer the question. So we, we actually observed 2%, uh, average performance, um, boosting, um, 255 | 256 | 00:45:18,071 --> 00:45:56,952 [Speaker 1] 257 | over, uh, multiple tasks. Um, definitely check out our, um [items clatter], performance table in, in our paper to, um, to figure out more. But those result, they are, they are kind of, uh, demonstrations to, to show the potential of our work. Um, and I... If, if you really read our paper, I, I would, uh, recommend, uh, you know, uh, to focus on, on the pre-training part because th- those are the part, pre-training and our part, 'cause these are the part that, uh, we did, uh, like rigorous com- co- comparison, um, without those, uh, noise in, in the SFT, uh, tasks. [items clatter] 258 | 259 | 00:45:56,952 --> 00:46:36,011 [Speaker 0] 260 | Yeah, that's amazing. Uh, y- I mean, I think... 
But maybe if I could just get your quick take on the... There's this sort of, like, lost in the middle context rot, this kind of... You- e- this is a... But this kind of clever architectural solution, do you think there also might be, like, particular training data? Like, 'cause I imagine it's like language modeling on the internet, the pre-training as you're predicting the next token, it's like how often is there informative content, like 20,000 tokens back. Do, do you think it's a training data problem as well or it's just sort of this Refrag clever architectural trick that's the way to solve this? 261 | 262 | 00:46:36,011 --> 00:46:47,231 [Speaker 1] 263 | Yeah. Uh, I definitely think that, um, it's a combination of both. So wh- when it comes to the, uh, performance of the language model, I, I would say that the data is always, you know, the, 264 | 265 | 00:46:47,232 --> 00:47:19,491 [Speaker 1] 266 | the, um... that, uh, one of the most, um... I would say, like, sometimes even fundamental, you know, the data is, is really, uh, something that we, we care about, and especially, you know, what kind of data you want to put in the training and what will be the mixture of those data. And, um, you, you are right, uh, a lot of time, you know, wh- uh, whe- when you think about those long-context problem, right, you, you would have to come up with, um, you- you know, good data, especially, uh, for those data that, uh, have very long context. But, um, 267 | 268 | 00:47:19,491 --> 00:48:10,832 [Speaker 1] 269 | you know, you, you kind of have to have the, uh, f- uh, you know, the data points that is very similar to finding needle in hay, right? And, um, that- that's, um, that's something that, um, I believe, um, is important for model training. And, um... But we didn't do a lot of, um, data ... Um, we, we didn't do a lot of data ablation in our, um, experiment because we, we... in, in this work, we mainly focus on, you know, the model architecture. But I, I would say, like, data is fundamental, and, um, you could do a lot of fun research there, um, in terms of, um, how do you... how do you figure out the best data mixture, and how do you know what kind of data you need to achieve those con- non-context capa- uh, capability of the language model? [items clatter] 270 | 271 | 00:48:10,832 --> 00:48:45,752 [Speaker 0] 272 | Yeah, amazing. Yeah, I think it's, it's super exciting. I, I guess kind of like maybe rounding out some questions, I, I definitely wanted to kind of talk about the time to first token improvement and compare sort of time to first token latency with overall LLM throughput latency. I think it's just such an interesting thing. Like one thing, I think there's Cerebras and Grok are these two companies making these customized chips, and I think the story is that Cerebras is faster time to first token, whereas Grok is faster through put in. And so... And also shout out to Baseten, they have... They're doing a great job with time to first token, I think just with engineering skill and 273 | 274 | 00:48:45,752 --> 00:48:46,832 [Speaker 3] 275 | [laughs] 276 | 277 | 00:48:46,832 --> 00:48:49,551 [Speaker 0] 278 | ... TV use. But, um, could you maybe just sort of explain... 279 | 280 | 00:48:49,756 --> 00:48:59,436 [Speaker 4] 281 | ... how you understand, like just, uh, maybe talk more about this kind of time to first token, first throughput latency with LMs. 
282 | 283 | 00:48:59,436 --> 00:49:36,216 [Speaker 1] 284 | Yeah, so, um, actually for time to first token, it's, it's always a, a very, um, important matrix in the, in the production system for different companies. And I talked to a lot of people in the industry, and they, they all, um... They, they are all aware of this matrix and, um, in a lot of companies, they, they think this matrix is, you know, fundamental for the user experience. And, um, so there are a lot of existing work that, um, try to accelerate the LM inference, right? Um, some through condensation, some through, you know, um, 285 | 286 | 00:49:36,216 --> 00:49:56,746 [Speaker 1] 287 | serv- um, engineering the serving of the language model, right? Um, and also, um, some, some work, they try to change the attention mechanism, so that when you do the attention, the, uh, transformer will take, um... would, would, would, um, would scale linearly with respect to the prompt, the Mm. 288 | 289 | 00:49:56,746 --> 00:50:38,156 [Speaker 4] 290 | ... length of the prompt. So, these are, are the, uh, are the very, uh, interesting work to... You know, they, they already try to accelerate the time to first token in a general use case, right? In, in a general, um, um, inference. And we believe that this method, they, they are still useful, um, uh, in a lot of, uh, applications. But, you know, in, in our work, the main focus is that, you know, for, for rep, right? We, we identify those, uh, u- um, unique structure there, and, um, we use those unique structure to further accelerate the, the time to first token, um, latency in, in, in the production system. And, um, 291 | 292 | 00:50:38,156 --> 00:51:15,136 [Speaker 4] 293 | those existing, uh, condensation and serving technique, they, they are kind of, um, orthogonal to our work. They could, uh, readily be applied to our work to further accelerate this time to first token. Um, but w- we didn't... uh, you know, we didn't try, try it, um, like all, all the combination in our work. And, um, I, I think it will be a, a fun, uh, a very fun, um, project to work on, like using all these, um, existing, uh, condensation and serving technique to further boost the time to first, first, first token latency for the refrag. 294 | 295 | 00:51:15,136 --> 00:51:21,456 [Speaker 4] 296 | Um, I, I think that they could be a, a, a, a major, um, performance boost. 297 | 298 | 00:51:21,456 --> 00:51:28,576 [Speaker 4] 299 | If you apply all the technique, it could multiply and, um, adding on top of each other. 300 | 301 | 00:51:28,576 --> 00:51:29,395 [Speaker 1] 302 | Mm. 303 | 304 | 00:51:29,395 --> 00:53:11,856 [Speaker 4] 305 | Yeah, super cool. Yeah, I think like, um... I don't know. I don't want to throw out some kind of number, but I think like t- like maybe like 300 millisecond LM inference time is a, is a... I don't know, like ballpark or just giving, maybe not based off it too much. But, um, yeah, it's really cool to hear that there are sort of also orthogonal benefits of time to first token. I think, um, there... a lot of applications, like if you have a Boolean valued output from the LM or a classification label, it's sort of all about time to first token. And then compared to, you know, writing a long response, now maybe it's like some- somewhere in the middle and just... Yeah, really interesting sort of thing to be looking at. Um, uh, cool. So, uh, I also... I wanted to sort of... 
As we're running out, I, I, I always think of the podcast with the general sort of future directions question, but, uh, but I want to just, um, ask one more question about the sort of schema of the refrag vectors. And I've already learned a lot from talking to you about, uh, m- some... Maybe sometimes you pre-compute it, sometimes you do compute the chunk vectors on the fly. Um, but I wanted to talk maybe about the sort of granularity of like, um, wha- whether you say retrieve the search results with a chunk and then you... uh, but then what is actually passed the refrag is like the entire blog post. So, sort of like an embed small, like retri- or like re- retrieve small, read big. Or maybe if you have sort of like a Matryoshka representation for refrag, where you have like a K equals 256, but then you also have K equals 16, like nested within the 256. And so maybe just kind of asking about like how you think the future of this, this schema design will evolve with all these different ways you could vectorize the content? Yeah. 306 | 307 | 00:53:11,856 --> 00:55:32,071 [Speaker 1] 308 | Yeah, definitely. Um, I, I think you, um, you, you mentioned a, a few, uh, very interesting direction that, uh, I also want to talk about. So, uh, one ma- main, uh, you know, uh, one main idea that I, I think people could, um, do, um, further exploration is that, uh, if you think o- our, um, our selective, selective policy, right? It's kind of w- uh... it, it kind of works like a, a dynamic, dynamic tokenizer, right? Um, we kind of decide, uh, how many tokens we want to put for certain context, right? Um, so for... So, in our case, you know, for, for chunks with more information, we use more tokens to represent it. But, but for, um, chunk with less information, we use one single token to represent the whole thing. And I certainly think that... Uh, I, I believe that there are better way to do it. Um, and, um, in our case, we, we only do binary, right? But a more... uh, a more ideal case is that, you know, for the whole sentence, we can decide... um, we can decide where to chunk. And, um, think about a, a, a token as a, a bag, right? You have one bag there. And, um, for each token, ideally you want to put, um, a s- similar amount of information there, right? And we could use those dynamic chunker to decide when to s-... when to, to, uh, cut the text off, so that this certain, uh, chunk has, um, a, a, a unique number of information. And, um... and we do a different length of, of chunking to make sure that, um, different... uh, like each, each token they contain this... similar amount of information.... and, um, and we could- we could also do th- this dynamic chunker, um, training it along with the decoder model so that they can, um, do end-to-end train- end-to-end training, uh, and, uh, decide where to, um, where to cut the text more, uh, smartly. So I- I certainly beli- believe that, uh, that- that's a- that's a better way, that's a, a, a perfect way to, to do, um, um, to do the chunking, uh, for the, um, rack system. And, um, I- I- I think, uh, if, um, i- i- it's a very interesting topic to work on 309 | 310 | 00:55:32,071 --> 00:55:34,932 [Speaker 1] 311 | in the future. 312 | 313 | 00:55:34,991 --> 00:56:15,232 [Speaker 0] 314 | Yeah, it's amazing. It c- it- it sounds like sort of the next level to that late chunking idea. I think Jina AI had, uh, came up with it. Um, I'm just kinda struggling to recall the exact details of it. 
[laughs] I think you, um, I think it's like you embed the whole context and you have the contextual interaction. [laughs] I'll have to cut this part of the podcast out. But we have a blog post on- on our Weaviate blog, and- and they're- and Jina has their great paper describing this. But, uh, anyways, uh, Xiaoqiang, uh, sort of the question I would anchor the podcast with, Refrag is obviously super, super exciting in itself, and maybe this is sort of your answer to the question, but, uh, what- what future directions for AI excite you the most? 315 | 316 | 00:56:15,232 --> 00:57:10,412 [Speaker 1] 317 | Yeah, I- I- I think right now, um, the agentic, um, is really, uh, one direction that I- I feel is- is, um, most exciting for me 'cause, um, you know, that- that's actually one of the reason why I- I wanted to work on this project, 'cause, um, context, uh, management is a very fundamental problem in agentic AI. And, uh, think about in our, um, research, we- we only do- uh, we- we only consider the search agent, right? Um, where, um, the- the e- the environment return the search result, and you would have to, um, do some management in the, um, search result so that, um, it- you wouldn't, uh, um, make the context super long. And, um, I- I believe that they, um, for the- the whole agentic system, there are more, uh, stuff to, um, there are many more stuff to, um, 318 | 319 | 00:57:10,412 --> 00:57:48,812 [Speaker 1] 320 | to work on, um, besides this, um, this- this, uh, specific way of doing, um, context engineering. And, um, for example, you know, uh, our- our Refrag, they- they, um, we could extend this work to, uh, like different, uh, not just search agent. We can do, uh, uh, the- this Refrag compression for, um, multimodal, um, language models so that, uh, it c- it could be applied to those, uh, GUI computer use agent, and, um, and it could also be applied to those, um, um, 321 | 322 | 00:57:48,812 --> 00:58:18,151 [Speaker 1] 323 | you know, the- the agent that take a lot of context information. And, um, as- as I mentioned earlier, you know, uh, for Refrag, um, if you compute on the fly, you still get the, uh, latency improvement, and that means that any, uh, context, right? It doesn't have to be search result. Just- just be any context that is super long, you could use our technique to- to do further accele- acceleration. So I- I certainly believe that, um, there are 324 | 325 | 00:58:18,151 --> 00:58:35,591 [Speaker 1] 326 | many similar, uh, uh, challenges that we could address, um, in- in a ge- agentic AI system. And Refrag is, uh, is, uh, um, is one of my, um, uh, attempts to, um, solve some of the fundament- fundamental problem in- in this system. 327 | 328 | 00:58:38,671 --> 00:59:34,591 [Speaker 0] 329 | Yeah, that's such an amazing idea. I- I give a quick mention to Charles Pierse from Weaviate who had i- introduced this idea to me in, uh, the Substack article he wrote about Refrag that was, uh... So it was the number one on Hacker News for the weekend, [laughs] so that was something that we were really happy about. But, um, it's this idea that, um, that, uh, uh, like, we're reading these vectors, can we also write these vectors? And like you mentioned, not just the search results coming back with these vectors, but just say like an agent is calling the Get Weather API... That's maybe a bad example, but say it returns the weather to you with these Refrag vectors.
I mean, weather is a terrible example, but say like even your email API, you're expecting even your email API to come back with all these vectors. And- and yeah, that's- that's really interesting 'cause, yeah, you have like this function calling loop agent, and as it gets to 20 steps of function calling, now it's got this big message history and- and it can't attend over it anymore. Uh, that's so cool. That's- that's a really exciting... 330 | 331 | 00:59:34,591 --> 00:59:34,611 [Speaker 1] 332 | Yeah. 333 | 334 | 00:59:34,611 --> 00:59:59,732 [Speaker 0] 335 | That kind of idea of, um... Yeah. But yeah, like, uh, a- agents sending vectors to other agents is- is like agent as a tool, uh, and, yeah, it's a really awesome one. Well, Xiaoqiang, uh, thank you so much for joining the Weaviate podcast. This is really just one of the most fun ones of all time. I've learned so much from this conversation, and it just is... Refrag is so exciting, so much fun to dive into these ideas. Thanks again for joining. 336 | 337 | 00:59:59,732 --> 01:00:00,812 [Speaker 1] 338 | Yeah, thanks for having me. -------------------------------------------------------------------------------- /data/podcast-transcripts/131.txt: -------------------------------------------------------------------------------- 1 | 00:00:00,080 --> 00:00:19,990 [Speaker 0] 2 | Hey, everyone. Thank you so much for watching another episode of the Weaviate Podcast. I'm super excited to welcome Matthew Russo, a PhD student at MIT. Matthew is doing incredible work at the intersection of AI and database systems, including his recent work on SemBench, a benchmark for semantic query processing engines. I'm super excited to dive into these things. Matthew, thanks so much for joining the podcast. 3 | 4 | 00:00:19,990 --> 00:00:22,000 [Speaker 1] 5 | Yeah, thank you for having me. 6 | 7 | 00:00:22,000 --> 00:00:33,060 [Speaker 0] 8 | Awesome. So I think I'd love to kick it off for our audience to just kind of explain this concept of a semantic query processing engine, and maybe introduce semantic operators and just sort of, uh, set the scene for this. 9 | 10 | 00:00:33,060 --> 00:03:33,340 [Speaker 1] 11 | Yeah, totally. Um, so in general, uh, I think the way, the best way that you can think of this pitch is that imagine that you have, you know, your traditional database, um, which has a relational algebra. You use SQL to query it, uh, and that gives you access to a bunch of, uh, operators under the hood such as filters, joins, aggregates, groupbys, and so on. And just simply put, you know, if you try to issue a SQL query over, let's say, a database that has, um, you know, let's say pointers to images of cars. And you say, "Okay, I want a filter for every row in my database that has an image of a red car." Well, currently this is not, like, really supported. Um, you know, your, your traditional relational operator has no way of, uh, computing this filter, the car in the image that's linked by this column's file path, um, is, has the color red. But today with LLMs, uh, you know, this is actually suddenly very possible. And, y- in the past there had been some work, um, in the databases community to try and extend sort of the SQL relational model to include these types of, um, you know, image and video-based filters, uh, text-based filters as well by sort of imp- putting these ML UDFs into the database.
But what's really changed now with the foundation models that we have both in terms of language models and vision models, is that they have extremely good zero-shot capability, right? So in the past, you kind of had to optimize the UDF for like, you know, you probably had to train a cascade model and tune it and parameterize it so that for that one query about red cars, it would do a really good job at filtering for red cars in a way that was, like, fast and, you know, high in terms of accuracy. But the great thing is that now that we have, uh, you know, these vision models, you could just ask it on one query, "You know, I want a filter for red cars." The next query could be, "You know, I want a filter for Porsche 911s." Uh, and because of sort of their, you know, their general world and semantic understanding, it's possible to just build a system around these foundation and language models that is able to sort of in zero-shot just answer any query that has some of these semantic operators. And so perhaps to just be very explicit, what I mean when we talk about semantic operators is you can imagine that you can specify, um, the where clause in a SQL query, uh, which we would say is a semantic filter, um, using just a natural language sentence, so, "The color of the car is red." You can also specify, uh, join predicates, um, in natural language. So I think in SemBench, one of the workloads, we have, um, audio recordings of animal noises as well as video... Uh, or taken as images capturing where these, uh, animals are by, uh, you know, trap recordings. And you can effectively say, "You know, I want to f- I want to join every instance of audio where that audio is the same animal as in the image." Right? So if you hear, like, the elephant trumpet and you see a ph- a photo of an elephant, you hope that those two, you know- 12 | 13 | 00:03:33,340 --> 00:03:33,470 [Speaker 0] 14 | Mm-hmm 15 | 16 | 00:03:33,470 --> 00:05:46,010 [Speaker 1] 17 | ... each audio pair gets joined together. Um, and then, you know, in some of my past work, um, we've done a lot of work as well with optimizing, uh, semantic maps. So this is something that you... I- it sort of shows up in databases in the sense that you can, like, create new columns on the fly, um, using, you know, t- traditional, uh, relational algebra and o- other operations. But, um, specifically here, for instance, we can have a dataset of, uh, research papers and we can apply a semantic map over the research papers to say, like, you know, "Summarize the paper." Compute sort of like a summary field. Um, and so the list goes on. There's also semantic aggregates, um, you know, semantic groupbys as well where maybe you're coming up with these groups on the fly, like, "What are my customers' top three complaints about some product," right? You need some semantic understanding to figure out what are the complaint categories and then to assign each review of some product to one of those categories. So, you know, so these are these sort of new semantic operators and what's... The main challenge is that when you include these in a database, optimizing for LLM processing cost becomes the bottleneck. Um, you know, suddenly a relational join is not a problem when you think about the fact that doing a semantic join is gonna involve N times M, uh, LLM invocations. And so your cost, your latency is gonna be dominated by that. And also, these semantic operators do not guarantee perfect quality. 
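As a rough illustration of the programming model being described, here is a toy, lazily built query object with sem_filter, sem_map, and sem_join methods. The SemQuery class and method names are invented for this sketch and do not correspond to Lotus, Palimpsest, or BigQuery's actual APIs; no LLM is called, and the point is only how a pipeline of semantic operators is declared before anything executes.

from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class SemQuery:
    """Toy, lazily built query over a named table of records (no real LLM calls)."""
    source: str
    ops: List[Tuple[str, str]] = field(default_factory=list)

    def sem_filter(self, instruction: str) -> "SemQuery":
        return SemQuery(self.source, self.ops + [("sem_filter", instruction)])

    def sem_map(self, instruction: str) -> "SemQuery":
        return SemQuery(self.source, self.ops + [("sem_map", instruction)])

    def sem_join(self, other: "SemQuery", predicate: str) -> "SemQuery":
        return SemQuery(self.source, self.ops + [("sem_join", f"{other.source}: {predicate}")])

    def explain(self) -> str:
        plan = [f"scan({self.source})"] + [f"{op}({arg!r})" for op, arg in self.ops]
        return " -> ".join(plan)

images = SemQuery("animal_images")
audio = SemQuery("animal_audio")
q = (images.sem_filter("the photo contains an elephant")
           .sem_join(audio, "the recording is of the same animal as the photo"))
print(q.explain())  # prints the lazily built logical plan: scan(...) -> sem_filter(...) -> sem_join(...)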
So now suddenly there's also this trade-off you have to balance between, "Well, maybe I'm gonna try to use smaller models to implement these operators. But if I do that, I might be sacrificing result quality." Um, so yeah, so it's a very interesting space. Uh, and I've worked on, uh, our own, at MIT, our own semantic query processing engine, um, Palimpsest, as well as its optimizer which was a separate paper we wrote called Abacus. And SemBench, uh, is this collaboration led by folks from Cornell, Google, uh, UTM, Bifold to Berlin, um, that is focused on trying to actually build sort of like the TPC-H or TPC-DS benchmark for this setting. Uh, and so I played a small part in that, but, um, happy to talk about any and all of the above. 18 | 19 | 00:05:46,010 --> 00:07:17,692 [Speaker 0] 20 | Mm-hmm. Yeah, amazing. There's just so many cool ideas to it. I guess maybe, like, our journey of adding sort of the AI layer to querying databases at Weaviate is we have kind of on one end our query engine that does this kinda like text-to-SQL thing, it can write filters and then... But that's one of the biggest innovations, I think, is just natural language to the operators. And then we have something that we call the transformation agent that's kind of like the semantic map where it applies an LLM over all the data in the database and then... But what we do is we then save the value into the database. And so, I think kind of the way that you use these operators is more, like, ephemeral. Like, you're computing it on the fly, but you don't need to save it on the database. But that's just like a little... I just wanted to touch quickly on kind of like where we're at with Weaviate in respect to understanding these things. Um, and yeah, at, at first, um, I was following Liana Patel's work after, um, she- she had came on the podcast. I met her and we talked about, uh, Acorn and, and so... And that's how I kind of became introduced to Lotus, and that's how I started to see this concept developing. Um, and so I think maybe if we could just kind of talk about the semantic operators. And, like, just... I know that you just gave a really great overview, but if we could just maybe even slow it down. I know in SemBench you present these five operators of, um, uh, sorry, semantic filter, semantic join, uh, semantic classify, semantic map, and semantic rank. [laughs] I think I remembered all five of them in the paper. But, um, so maybe if we could just... I think, uh, uh, we could maybe just go further into those five and maybe explain how they differ. Part- particularly for me, I'm having a hard time understanding how semantic filter, semantic map, and semantic classifier really... They seem kind of similar to me. 21 | 22 | 00:07:17,692 --> 00:07:17,812 [Speaker 1] 23 | Yeah. 24 | 25 | 00:07:17,812 --> 00:07:18,872 [Speaker 0] 26 | Yeah. 27 | 28 | 00:07:18,872 --> 00:12:30,243 [Speaker 1] 29 | I think... So to some extent, I think this was an effort by the paper authors to try to identify what... 'Cause each of these systems has kind of adopted its own language of semantic operators, right? This is very early days. And so we're all trying to figure out, um, what is the space of operators. There's one view, which I guess you could call, like, the, the semantic map maximalist view, which says [laughs] that with a semantic map you can do all of semantic map, semantic filter, semantic classify, and semantic rank, right? Like, and... Well, actually that one is slightly different because it depends how you implement it.
If you are taking an entire dataset and then saying, "I want to rank all the items in this dataset," then it's a bit more like an aggregate operation. Um, but if you were to first flatten, you know, things into a ro- like a row with the list, you could in theory use a map to just for that one row rank the items in the list. Um, so I do think that, to your point, a lot of them on the surface as a programmer you might say, "Oh, I could potentially implement both filter, classify, um, or rank using a map operator." I think that the key distinction and what's really important here, and I'll, I'll highlight this especially, uh, for map and filter 'cause I think this is where it's most obvious, is that by explicitly separating, you know, a filter from a map, you actually enable a different set of optimizations under the hood, um, that the semantic query processing engine can go and implement on the user's behalf. And I think that, um, you know, for database folks who are hearing this, this is kind of obvious. For AI folks who maybe don't think about declarative optimization so much, this might be like a little bit of a weird paradigm shift. Uh, and so just t- for like a 30-second overview of declarative optimization, the real pitch is that, you know, uh, in a database system or in a semantic query processing engine, you don't tell the system how it should implement an instruction, right? So, if you say, "I want to apply a semantic map to extract the summary of a research... of a set of research papers." Uh, so s- you know, extracting the summary for each paper in that dataset. You don't say, like, "You should do it using GPT-4.0 mini with this temperature setting." You don't say, "Oh, you should do it by using an ensemble of these three models and then synthesizing their summary into one more concise one." You just say, "I want to do a semantic map. This is the instruction. Go compute a summary." And the system is left to optimize that itself. And so you're putting your faith in the system to be able to take that high-level instruction, to compile it down to some physical execution that ideally is going to give you really high quality, really low cost, and really low latency. Um, and so because of this, uh, for semantic filter in particular, one way that a lot of these systems have optimized these filters, and I know this is true, uh, of Lotus, uh, and I think Thalamus DB, they also use approximate query processing techniques, so they may also be doing something somewhat similar. Um, but in Lotus, uh, what you effectively do is you have, let's say, your gold algorithm, which could just be, like, applying the most expensive model to all of the records in the dataset to compute that filter. And then you can have a proxy model, which is like a much cheaper and faster version of it. Uh, so that proxy model, instead of, let's say, being GPT-5, it could be, like, LLaMA, um, you know, 3.8 billion parameter model. And what they will effectively try to do for that filter operation is that they'll apply the Oracle model, the, the powerful model to a bunch of records. They'll then compute certain thresholds, where it's like, okay, anything above this threshold, um, for the- Sorry. They'll also run the proxy model on the same set. So they get sort of like a sense of, like, the proxy's confidence as well as the Oracle's confidence on how well each, uh, op- uh, record passes the filter. 
And then based on sort of the re- By looking at the result from the Oracle, and looking at the scores from the proxy, they can kind of set these thresholds where it's like, "Okay, if the proxy says something likely passes the filter with a score above, like, 0.7, let's just say, we're automatically gonna just let that pass the filter. We don't need to run it by the Oracle." If it's below some low threshold, we're gonna say, "Okay, it's automatically not gonna be processed by the filter." Uh, and it's... And then maybe if it's in some small band between the low and high threshold, then we say, "Okay, it's really unclear, so only these small band of records will go to the Oracle." And so in doing that, you effectively trade off potentially a little bit of accuracy by not using the Oracle for every, uh, record. But you save hopefully a lot in terms of processing cost and processing speed. The thing is, is that, um, you can do that for a filter, because it's very easy to assess the binary, like, correctness of the proxy and whether it agrees with the Oracle. Like, both of them said that this record passes the filter, or both of them said that this record does not pass the filter. Uh, with a semantic map where you're computing the summary of a research paper, you can't really do this. I can't take, like, the Oracle summary...... and the proxy summary and come up with some notion of like, "Oh, like, okay, in this setting, I can use that." So that's 30 | 31 | 00:12:30,243 --> 00:12:51,964 [Speaker 1] 32 | ... To tie it all the way back to your question, that is why it's actually kind of important to have this distinction between a map and a filter, because the nuance of those operations allows for a different set of optimizations under the hood. Uh, and there's similar sort of things you can do with ranks and classify operators as well. So I think that that's kind of why we split out, uh, the space, the set of operators in this way. 33 | 34 | 00:12:51,964 --> 00:13:00,804 [Speaker 0] 35 | That's very cool. Yeah, there's so- so many cool things that, I mean, yeah, the kind of declarative programming model is always so exciting. I'm a huge DS Py fan, so I love to kind of- 36 | 37 | 00:13:00,863 --> 00:13:01,214 [Speaker 1] 38 | Yes, yes 39 | 40 | 00:13:01,214 --> 00:13:54,304 [Speaker 0] 41 | ... think about it in that sense. Um, and yeah, uh, so I guess like with the operators, yeah, I think that's a r- a really great explanation of how semantic classify and semantic map, semantic filter differ for the query engine. Um, and then I kind of just... As, you know, listening to you talk is inspiring me thinking about these kind of database query planners, the kind of cost models, as you mentioned, the proxy model and the Oracle model. And so maybe we could introduce this concept and then later come back to semantic joins, because I think that is such an interesting different concept. But, um, this kind of like in relational algebra, say you have two filters, the car is red or the car is a BMW, there's like an order to which filter you should apply first. And so as I, as far as I understand it, that's kind of the general setup, is there's like an optimal ordering to how to apply the filters. 
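A minimal sketch of the proxy and oracle cascade for a semantic filter described just above: proxy_score and oracle_passes are placeholder functions standing in for a small model's confidence and an expensive model's judgment, the thresholds are made up, and the calibration step (running both models on a sample to pick those thresholds) is omitted, so this shows the control flow rather than Lotus's actual implementation.

from typing import Callable, Iterable, List, Tuple

def cascade_filter(
    records: Iterable[str],
    proxy_score: Callable[[str], float],   # cheap model: confidence in [0, 1] that the record passes
    oracle_passes: Callable[[str], bool],  # expensive model: authoritative yes/no
    low: float = 0.2,
    high: float = 0.8,
) -> Tuple[List[str], int]:
    """Return the records that pass the filter and how many oracle calls were spent."""
    passed, oracle_calls = [], 0
    for record in records:
        score = proxy_score(record)
        if score >= high:          # confident pass: trust the proxy
            passed.append(record)
        elif score <= low:         # confident fail: trust the proxy
            continue
        else:                      # uncertain band: escalate to the oracle
            oracle_calls += 1
            if oracle_passes(record):
                passed.append(record)
    return passed, oracle_calls

# Toy usage: keyword heuristics stand in for the two models.
records = ["a red sports car", "a blue sedan", "a reddish-orange coupe"]
passed, calls = cascade_filter(
    records,
    proxy_score=lambda r: 0.9 if "red " in r else (0.5 if "redd" in r else 0.1),
    oracle_passes=lambda r: "red" in r,
)
print(passed, calls)  # only the ambiguous record is routed to the oracle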
And then maybe in Abacus, you have something like a sample 500, start running the filter in order to get the, um, like 42 | 43 | 00:13:54,304 --> 00:13:54,554 [Speaker 2] 44 | [clears throat] 45 | 46 | 00:13:54,554 --> 00:14:03,983 [Speaker 0] 47 | So- so does that even have a sense of what the cardinality of the resulting filters might be. And maybe you just mind explaining that sort of like what a query cost parting model is, and yeah. 48 | 49 | 00:14:03,983 --> 00:17:51,304 [Speaker 1] 50 | Totally. So, um, so I think that this is one of the, one of the strengths of, uh, of our work on Abacus. It's something that we, you know, we emphasize a lot. This is one distinction, I guess, between Lotus and Palimpsest and Abacus, um, which I'll- I'll just refer to that as either Abacus or PZ when talking about my work. Um, the name Palimpsest is kind of a mouthful. Um, but so Lotus uses like the, the data frame processing model. And so actually, it cannot do sort of like the filter reordering, like you mentioned. Um, and that's just because depending on the order in whi- ... Because it's not a lazy execution model, it means that for each line in your program, right? If you have your data frame and you do dataframe.filter, the car is red, and that returns a new dataframe. And then you do dataframe two is equal to dataframe.filter, the car is a BMW. The problem is you have to materialize that first dataframe that filters for the car as red before you can process the line that says the, you know, the dat- the car also is a BMW. And so in Lotus, you can't reorder those filters. Um, but in a system like Palimpsest, and I believe as well like in BigQuery and Thalamus DB, these, um, have a, a lazier, uh, execution model or just your traditional SQL model. So you submit basically the whole query, uh, and then what will happen is that underneath the hood, there is a query optimizer, which will take that query and then apply any set of, at least in Abacus, um, transformation and implementation rules. And so transformation rules include just like logical reorders to the plan. So that could include reordering filters. Um, it also in theory could include reordering joins, although we haven't, uh, written that up- optimization into our system yet. Um, and then it also includes what we call implementation rules. And so that's like saying, "I'm going to execute this filter with, uh, GPT-5," or, "I'm going to execute this filter with Gemini 2.5 Flash." So the implementation rules are kind of where you get your like, you know, uh, map or filter-specific optimization of like which model to use or, "Should I use an ensemble?" Things like that. Uh, and then each of these systems, to some extent, will have a cost model, which tries to effectively provide a prediction. Um, and it can be either, uh, in our case, it's based on sampling. But we try to get some notion for each operator of, uh, you know, if we implement it this way versus that way. What do we expect the cost per input is going to be? What is going to be the latency per input? And at- and at least in our system, on a scale of zero to one, how high quality is this operator going to perform, uh, for this specific map filter, uh, or classify operation? And then, um, basically, the- it's a dynamic programming algorithm that will go through all of the possible configurations of the physical plan, considering possible reorderings of the filters in the logical plan. 
Uh, and then it basically tries to return the physical plan and execute that one that is optimal for your user objective. And so- and that last part is also one, um, one distinction for our system, is in PZ and Abacus, uh, users can specify if they want to maximize quality, if they want to minimize cost, if they want to minimize latency. And they can also apply constraints to the other dimensions. So you can actually say, "For this query, I want to optimize for quality, but I don't want to spend more than like 10 cents per input record." Uh, and so that kind of gives you a little bit more control as a developer over, like, you know, if you're- if you're prototyping at first, you can just go for like the min-cost optimization until you get your query working. And then, over time, you can try and play around with like maximizing quality, but maybe with some thresholds on cost to keep it more manageable. 51 | 52 | 00:17:51,304 --> 00:17:57,744 [Speaker 0] 53 | Uh, I'd- I'd love to ask about how you think about with Palimpsest, the design of, um, 54 | 55 | 00:17:57,744 --> 00:18:06,224 [Speaker 0] 56 | uh, like... So does this, does this layer live as like a service outside of the database? Would- would this be something that's built directly into the database? 57 | 58 | 00:18:06,224 --> 00:19:41,056 [Speaker 1] 59 | Yeah, I think, um, so we build it as a, uh, a framework in Python. And we made that choice mainly because we were just like, "This is- this is where the AI folks are. This is very AI-driven." Um, at the time that we- we initially published the Palimpsest paper, like, uh, there were not many of these SQPs kind of around. I don't even think this sort of formalization had materialized yet. And so, uh...It was sort of like we wanted to do this outside of the database because we weren't sure whether the... Whether it was really gonna work that well. And it seemed like implementing this in a database was gonna be a lot of overhead and a lot of, um, you know, a lot of software engineering effort. So we implemented it outside so we could kinda have full control over the system, the optimizations. Uh, but I think that the... That all of the ideas in PZ and Abacus can be directly applied to a database. So like, um, you know, you could in theory apply this cost model or some of these optimizations into, like, uh, you know... A- and I mean Google's kind of proof of this in the Sam Bench paper, right? They have BigQuery, which has these semantic operators built into the system. Um, so yeah. So I think that you can... You can definitely... Like, PZ itself I wouldn't say is a database, um, but you can do many of the same operations in PZ. Um, and then if you were working at, you know, a database company, uh, but you wanted to borrow ideas from PZ, you wouldn't use PZ per se itself, but you would probably take... Lift those ideas and then implement them in your, uh, database. 60 | 61 | 00:19:41,056 --> 00:19:48,546 [Speaker 0] 62 | All right. That's super cool. And, uh, yeah, so, um, so I'm sorry if this is a dumb question, but I do- I don't really understand the kinda No, it's not a dumb question. 63 | 64 | 00:19:48,546 --> 00:19:48,786 [Speaker 1] 65 | ... [laughs] 66 | 67 | 00:19:48,786 --> 00:20:17,136 [Speaker 0] 68 | [laughs] But, um, like, um, when you mention, like, the physical plan and the logical plan, I'm curious, like, uh, like, the... There's a physical plan. 
Like, when you're building palimpsest on top of a database like say Weaviate [laughs] or, or like PostgreSQL or whatever it is, d- can you get some kind of API to see, like, physically where is the data? Or is, is that what's meant by physical plan? Like, there is some... Here's exactly how on disk the data is, and these are the indexes or... Yeah. 69 | 70 | 00:20:17,136 --> 00:20:42,496 [Speaker 1] 71 | Kinda, yeah. It's, uh, it's inspired a lot by what you're saying, which is that the... So the logical plan is sort of just the representation of your query where you just have the types of the operators, right? So you can imagine, um, let's, let's take, uh, as an example, I think we have this diagram in our, uh, in our Abacus paper. Uh, there's a set of research papers, 72 | 73 | 00:20:42,556 --> 00:22:50,676 [Speaker 1] 74 | and you wanna filter for papers that are about databases. And then you wanna use a f- a first semantic map to summarize the main contributions of each paper. Uh, and then you wanna use a second semantic map to classify whether they're relevant to your interests, and, you know, you can specify what your interests are in the map instruction. All right. So there is a logical representation of that plan, uh, which we would say is just... It's a scan, 'cause you're gonna scan over all the research papers. And then it's a filter, so... 'Cause you're gonna apply your, you know, your filter, and then it's two logical maps. And so that logical plan can be optimized in essence by, um, making sure that the filter happens before the maps. You could imagine someone could have written the same program where they did, uh, you know, the two map operations, and then the last step was to filter about... This is about data systems, right? So logical optimizations for the logical plan correspond to things like, "Oh, I'm gonna push the filter down so that we execute that first." Um, it also... It can include, if you're... If you have semantic joins, you know, the order in which we execute these joins depending on, you know, what we think the cardinality of the join might be. Um, and then the physical optimization, in a logical plan, you still don't have this notion of like, "How am I gonna execute the map?" Like, "What model is actually going to do this task?" And so that's what... That's the difference between a logical and a physical plan. A physical plan is just saying like, "Okay, it's the logical plan where the filter comes first." It's a scan filter map map. "And specifically we're gonna do the first filter with like, you know, three... 8 billion parameter model." Uh, and then the second map with GPT-4o mini, and the final map is gonna be like some ensemble, um, that uses these three models. Uh, so yeah, so that's, that's the key distinction. Um, but to your point, in traditional databases, a lot of that difference between logical and physical is that, um, the physical plan depends a lot on understanding like where is data laid out. That's gonna tell you things like, uh... And like, "What indices do I have available to me?" That can give you a sense of like, "Okay, what's the cheapest scan option for this query?" So you're, you're definitely barking up the right tree. 75 | 76 | 00:22:50,676 --> 00:23:18,426 [Speaker 0] 77 | [laughs] Awesome. Yeah, well, it's super interesting. It... Yeah, it's such a cool topic. 
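To ground the logical versus physical plan distinction from the last couple of answers, here is a toy sketch in which a logical plan only records operator types and instructions, while a physical plan additionally pins a model choice to each operator. The model names are invented placeholders, and nothing here reflects Abacus's real data structures.

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class LogicalOp:
    kind: str           # "scan", "sem_filter", "sem_map", ...
    instruction: str

@dataclass
class PhysicalOp:
    logical: LogicalOp
    model: Optional[str]  # which model executes it; None for non-LLM ops like scan

# Logical plan: what to compute (order already chosen so the filter runs first).
logical_plan: List[LogicalOp] = [
    LogicalOp("scan", "research_papers"),
    LogicalOp("sem_filter", "the paper is about databases"),
    LogicalOp("sem_map", "summarize the main contributions"),
    LogicalOp("sem_map", "is this paper relevant to my interests?"),
]

# One possible physical plan: how to compute it (model assignments are illustrative).
physical_plan: List[PhysicalOp] = [
    PhysicalOp(logical_plan[0], None),
    PhysicalOp(logical_plan[1], "small-8b-model"),   # cheap model for the filter
    PhysicalOp(logical_plan[2], "frontier-model"),   # expensive model for the summary
    PhysicalOp(logical_plan[3], "mid-tier-model"),
]

for op in physical_plan:
    print(f"{op.logical.kind:<11} model={op.model}  :: {op.logical.instruction}")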
Um, I guess, I guess maybe just kind of before diving a little further into the systems and how particularly they optimize them and, you know, design these f- uh, especially talking about the selecting the models, I'm really excited to dive into that. But I also just maybe wanna set the stage for our listeners more on, uh, semantic joins. I'd seen this article really early on, like a... Here's, like, maybe an application of vector searching would be like [laughs] if you like dating, and so you'd have- 78 | 79 | 00:23:18,426 --> 00:23:18,426 [Speaker 1] 80 | Yeah 81 | 82 | 00:23:18,426 --> 00:23:19,036 [Speaker 0] 83 | ... like the two 84 | 85 | 00:23:19,036 --> 00:23:19,456 [Speaker 1] 86 | [laughs] 87 | 88 | 00:23:19,456 --> 00:23:24,296 [Speaker 0] 89 | And you'd be... Yeah. Yeah. Do you mind, uh, maybe just telling the story of semantic joins? 90 | 91 | 00:23:24,296 --> 00:28:40,132 [Speaker 1] 92 | Yeah. Um, so semantic joins, I think there's a lot of, of interest in them, um, and excitement around them, in part because in traditional databases, uh, optimizing joins has been, uh, extremely important for getting good query performance. And, um, and I think that the... So I, I'm actually gonna get... Maybe I'm gonna be a bit of a letdown here. I'm gonna give you [laughs] a bit of a... Like, a pro and a con here. So the pro is that the same should be true in this... Uh, in the setting for, uh, semantic operators where op- you know, if you have a query with a semantic join, um, that is going to dominate the cost of the query. Uh, and like actually in the revision for Abacus, we just had to rewrite one of our, uh, programs to have three semantic joins. And one of these semantic joins involves like 12,000 images. And so the query costs like anywhere from $2 to $27 depending on how you physically implement, you know, the matching of images to text strings. Um, and it just shows you like how important it is to get... If you're trying to minimize cost, like use the cheaper embedding-based approach to join these things, 'cause like if you run a vision model on 12,000 images, you're gonna spend-... quite a lot of money. Um, so I think that from an academic perspective, um, there's a lot of excitement around, how do we optimize these joins? You can think of, uh, potentially another technique as being like, let's just say you wanted to, uh, join two different f- pairs of text. So maybe it's like movie titles and then descriptions of the movie. Like, do you know? Is this... Or, or genres perhaps. So for each movie you want to match, does it, uh, correspond to a certain genre? You don't necessarily have to, um, you know, take, let's say you have five movies and five, uh, genres. You could process them one by one. So you could take the first movie and the first genre and ask, or, you know, does the genre match the movie, true or false? And you could kind of then go down to the next genre and so on. Alternatively, you could also just take all five of them, feed them into the LLM as a batch, and then ask it to come up with a very compressed output that tells you only the movies that joined a certain genres. And so you would save on the number of LLM calls, which hopefully reduces your latency. Um, and also you could potentially save on some processing costs if you're clever about how you, uh, elicit the output from the model. Um, so there's... 
So for these reasons, like there's a lot of excitement around, you know, we've had all these ideas and we've spent a lot of time optimizing joins in relational databases. Clearly there's a lot of work and research to be done optimizing these joins for, uh, the semantic setting. So that's my, that's my bull, like my bull case. My bear case for semantic joins is that it's actually kind of hard to come up with... Um, it's not impossible, but it, it, it actually takes racking your brain a little bit sometimes to come up with interesting examples of semantic joins that are extremely relevant, um, for data processing workloads. Um, like, a- a- and maybe it's less true for text to text, um, but like for image to text even, um, it can be slightly difficult. Like, you might think of like, "Oh, does this caption match the image?" Okay, that makes sense, right? Yeah, like does this caption describe what's happening in the image? Totally get it. But like, what's an example of a setting where I have a bunch of captions and a bunch of images, but they're separated and I need to bring them together? Like, u- usually the two come together already. So, um, so I would say that... And I'll probably like be proven wrong in one year and there'll be like a lot of egg on my face for this take, and people will be like showing tons of examples and like, "Here's an obvious semantic join." Um, but at least that's one thing where sometimes when I, when I tell folks about this, one feedback that I have gotten from, uh, like non-AI experts in the past has been sort of like, "Okay, but like when am I gonna actually do that?" Um, or, "Why wouldn't I just, you know, apply a filter on both sides, extract some distinct field and then do a relational join?" And that is potentially an optimization also that's worth looking into, is like, uh, and, and that you could implement as an optimization for a semantic join in a system, would be to say like, "We actually do a, a filter and a map on both sides." Um, or sorry, "Do a map on both sides to extract like what animal is in this image, what animal is in this audio snippet, and then just join on those, uh, like those animals in a relational fashion." There are some downsides to it. We actually did kind of try this, um, for the animal image query. The problem is that sometimes on the map, um, if you don't like distinctly say like these are the classes for the animals, you can get one that's like, "This is an image of an elephant," and then you can get an audio recording like, "That's the audio of a North African elephant." And then when you try to join elephant to North African elephant, it doesn't join, if you do it relation- in a relational fashion. Um, so yeah. So it's a tricky, it's a tricky subject for sure, but I think it's one that's very exciting. Um, and I'm sure in a few years' time we'll have tons of use cases and I'll look like a fool for, uh, my skepticism, but yeah. 93 | 94 | 00:28:40,132 --> 00:28:59,122 [Speaker 0] 95 | Mm-hmm. And it starts me to think of one of those GitHub repositories, it's like awesome semantic joins, and it's just like a collection of like thousands of queries. Like there's like... I could imagine like the, um, the job applications to resumes one is like a good motivating example. I think, I do think the dating one is pretty good. We do have like a customer at Weaviate [laughs] that uses- 96 | 97 | 00:28:59,122 --> 00:28:59,132 [Speaker 1] 98 | Yeah 99 | 100 | 00:28:59,132 --> 00:29:02,212 [Speaker 0] 101 | ... 
vector databases for that kind of thing. Um, but... [laughs] 102 | 103 | 00:29:02,212 --> 00:29:50,812 [Speaker 1] 104 | I think, I think it's, I think it's definitely, to your point, um, text to text, I think there's a lot of examples. Um, another one I s- I've heard about is there's a, a company that does, um, uh, sort of, sort of like Mechanical Turk, right? Like people submit tasks and then there are sort of workers, uh, with certain skills. And so they're actually trying to use, um, from what I understand, AI to try and match, you know, qualified, uh, people to perform given, uh, freelance tasks. Uh, and so, but again, the multimodality aspect I think is the, where it's kind of hard. It's like where do images, where do audio come into this? And so it may end up being the case, but even still, like if you can... If you have a lot of text things you need to join, it's good if you can do it in an optimized fashion. 105 | 106 | 00:29:50,812 --> 00:29:58,051 [Speaker 0] 107 | Yeah, that starts me to maybe think of joining like images with like music. Uh, yeah, that's cool. [laughs] 108 | 109 | 00:29:58,052 --> 00:29:59,012 [Speaker 1] 110 | Yeah, it's, it's- 111 | 112 | 00:29:59,012 --> 00:29:59,172 [Speaker 0] 113 | [laughs] 114 | 115 | 00:29:59,172 --> 00:30:05,042 [Speaker 1] 116 | Well, you see, it's a little tougher to come up with one and like convince yourself like, "Ah, yes, this is, this is, uh, needed." 117 | 118 | 00:30:05,042 --> 00:30:05,051 [Speaker 0] 119 | [laughs] 120 | 121 | 00:30:05,052 --> 00:30:15,532 [Speaker 1] 122 | But I'm sure that there are use cases. I just, you know, it's hard when you're in your bubble in academia sometimes to think of these things. But there are definitely folks in the industry who are like, "What are you talking about? Like I have to do this every day." 123 | 124 | 00:30:15,532 --> 00:30:40,332 [Speaker 0] 125 | [laughs] Yeah, it, um, it really... I mean, I, I love kind of setting the scene with like the declarative programming model and it's like how particularly we implement the semantic joins is indifferent from just defining it as one of the operators. But it makes me think like, it could be like either extreme m- you know, extreme multi-label classification to try to map these two things to each other. Like if I have 100, uh, unique categories of the one thing I'm trying to join with the other, it's like that problem. 126 | 127 | 00:30:40,332 --> 00:30:40,832 [Speaker 1] 128 | Right. 129 | 130 | 00:30:41,056 --> 00:30:59,456 [Speaker 0] 131 | And then, uh, I really love, uh, topic modeling, uh, t- topic modeling where you're like clustering the vectors. It, it... You represent which object in the vector is, and then you cluster them, and then you summarize the clusters. And maybe that has something, 'cause you're kinda like com- coming up with the categories on the fly. I don't know if that's quite a semantic join. I think that's like a sum up, yeah. 132 | 133 | 00:30:59,456 --> 00:31:02,216 [Speaker 1] 134 | I think you're... Like a semantic group by potentially could be- 135 | 136 | 00:31:02,216 --> 00:31:02,386 [Speaker 0] 137 | Yeah 138 | 139 | 00:31:02,386 --> 00:32:00,456 [Speaker 1] 140 | ... um, could be an instance of that. And the cool thing there is that actually what you just described would be a semantic group by, where the aggregation is also semantic. So you're, you're clustering the groups and then you're saying, "Okay, what does this group represent?" Like come up with a s- semantic description of it. 
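A toy sketch of the cluster-then-summarize flavor of semantic group-by that was just described: embed and label_cluster are placeholder callables where a real system would call an embedding model and an LLM, and the greedy nearest-centroid grouping is only meant to show the shape of the operation.

import math
from typing import Callable, Dict, List

def cosine(a: List[float], b: List[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b))
    return dot / norm if norm else 0.0

def semantic_group_by(
    rows: List[str],
    embed: Callable[[str], List[float]],        # stand-in for an embedding model
    label_cluster: Callable[[List[str]], str],  # stand-in for an LLM that names a group
    threshold: float = 0.8,
) -> Dict[str, List[str]]:
    """Greedy clustering by embedding similarity, then an LLM-style label per cluster."""
    centroids: List[List[float]] = []
    members: List[List[str]] = []
    for row in rows:
        vec = embed(row)
        sims = [cosine(vec, c) for c in centroids]
        if sims and max(sims) >= threshold:
            members[sims.index(max(sims))].append(row)
        else:
            centroids.append(vec)
            members.append([row])
    return {label_cluster(group): group for group in members}

# Toy usage: a 2-d "embedding" and a keyword-based "label" stand in for real models.
reviews = ["battery dies fast", "battery barely lasts a day", "screen cracked quickly"]
fake_embed = lambda t: [1.0, 0.0] if "battery" in t else [0.0, 1.0]
fake_label = lambda group: "battery life" if "battery" in group[0] else "durability"
print(semantic_group_by(reviews, fake_embed, fake_label))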
Um, and I think that that is actually a very, very interesting problem because a lot of traditional, like, clustering work, um, you know, that's kind of what you're trying to do. It's like, if you have a bunch of unstructured data and you're just embedding it, clustering it, and then hoping that these clusters mean something. Um, in theory, there's a world in which we have... You just say, "I wanna do my semantic group by," uh, and the system can optimize it, and then you just can, you know, ideally have some levers of optimization to try and maybe get those clusters and groups to align better with what you are expecting in the data. Um, so it, in theory, could make that whole, like, embedding based clustering problem a lot more streamlined. 141 | 142 | 00:32:00,516 --> 00:32:56,466 [Speaker 0] 143 | Th- that... And also, like, out of opening the podcast I mentioned that at Weaviate we have this thing called Transformation Agent that's kinda like semantic map or semantic classify, where we save the attributes back into database, into the database. And something we saw people do a lot is they'd run the Transformation Agent and then they'd use the Query Agent to ask questions about this new property. Like, if you're saying the car is red and you didn't already have that stored. And so then with this kind of semantic group by, I was thinking like about, um, like Databricks has this catalo- uh, they call it a Unity Catalog, sorry, where it's like about cataloging data, and I thought that could be a really great, like, interface for this semantic group by, as like kinda like a catalog agent. In Weaviate, it's like something that would group your data, give you a nice way to visualize these groups. Um, yeah. Anyways, so it just... So cool. Um, I... So I'd love to... I think this will transition well into talking about the dataset 'cause we kinda talked about, like, these use cases, examples of semantic joins. I think kinda maybe... Yeah, if we could now kinda transition into talking about what SEMBench is and how you're trying to benchmark this. 144 | 145 | 00:32:56,466 --> 00:36:49,736 [Speaker 1] 146 | Yeah. Um, so I think that, uh, at a, at a high level, the goal for SEMBench is to kind of create a parallel to your TPC-H or TPC-DS, um, uh, benchmark, which for analytical query processing has kind of been like the gold standard for testing out, uh, different databases, different query engines, query optimizations. And so the key... But the key problem with, uh, those benchmarks is just that, um, they don't have, you know, text or like large text fields or images or audio. And so in this new domain where we have semantic operators and where cost, latency, and now variable quality are determined by the LLMs you use and how you apply them, um, it really requires a new benchmark so that we can test out systems and get a sense of, you know, um, e- Which systems perform well at what operators? Um, are there certain types of queries that are difficult across all systems or wherever? Do we see high variance? Uh, what optimizations work well for this setting? So, I think that the, the initial SEMBench paper, um, again, very large collaboration, in, of which I played a small part. So I, I wanna, you know, give credit to all the other authors on the paper. Uh, and I don't wanna speak for all of them, um, but just at least my perspective on the benchmark. Uh, and I think that it's... You know, it's a great addition to the space. 
We've already been able to benchmark, uh, Lotus, ThalamusDB, Palimpsest, and BigQuery, uh, in this setting, and create a leaderboard that's public so people can kind of investigate the results and, and play around with these different systems. And I think that, you know, there's probably a future where we're gonna continue to grow and expand the benchmark as well. Um, 'cause, you know, right now it has 55 queries across five different workloads. And so those, uh, workloads, we call them, like, movie, wildlife, e-commerce, MMQA, and medical. Um, and they span effectively, uh, queries over four data modalities: tabular data, text data, image, and audio. And they include all of the semantic operators that you just, uh, described. Not... Uh, They're not, uh, semantic group by and semantic aggregate, but also that's something that I think maybe in the future we would, we would look to include, uh, in the benchmark. Uh, just my perspective. Again, not speaking for everyone else. Uh, so yeah. So I think that the, the benchmark is great because for a while, um, you know, when we were writing both the Palimpsest and the Abacus paper, we were kind of... You kind of had to just like find datasets that had been used in other AI settings and papers and workloads and like... And usually we couldn't even use the task as it was described, so you had to kind of change it a little bit to make it fit really nicely into like, "Oh, this is actually a filter with a map, uh, and there's a..." And then maybe you can add a second filter somehow, so now we have to be good at figuring out the right ordering of filters. Um, or maybe I can represent this as a semantic join, and so now join reordering becomes a problem. Um, but there just, like, wasn't a clear and clean benchmark upon which to say, "Okay, we're all gonna evaluate on this. We're gonna have some agreement about what are the settings and what are the parameters for the different systems under evaluation, and get sort of a very clear understanding of which systems are good at, you know, good at which operations and, uh, how do they perform relative to one another." Um, and, you know, just... But I think that honestly, the relative performance is not- not that important, um, because again, it's very early. Um, these systems are still very sensitive also to prompts, um, and so you can get very large performance changes just based on changing the prompt, which, um, you know, ideally in the future you would normalize for that. Um, but yeah, I think it's, it's a step in the right direction, um, to... And it's hopefully the type of thing where when you build the right benchmark, a whole community can build up around that, uh, and, and sort of drive performance and drive research into this area. 147 | 148 | 00:36:51,086 --> 00:37:18,645 [Speaker 0] 149 | Yeah, it's so cool. I think like database benchmarks is such like an evolving area. Like, um, as I mentioned, we're building this kinda like text-to-SQL thing. Um, we saw these datasets like Bird, Spider, uh, WikiSQL, but, but then we're like, "Well, we wanna add a search, uh, query operator to it," 'cause they don't include that in the SQL language. I can imagine adding all these semantic operators. And yeah, so, so, um, so, so you designed this dataset using Kaggle. Is that right? Uh, like, how do you... yeah. 150 | 151 | 00:37:18,646 --> 00:39:18,306 [Speaker 1] 152 | Yeah. So I think a lot of the, um... So for my part, I, I didn't, uh, do a lot of the, the query and benchmark creation. 
I was mostly responsible for just making sure that like the PZ queries were implemented correctly, helping support with bugs that came up in that sense. Um, but I do... My understanding is that for most of the scenarios, uh, what happened was they took publicly available Kaggle benchmarks, or Kaggle datasets, and then what they did was they created sort of 10 queries over those datasets. Um, so I think the animal one is actually a really, uh, useful example. Um, so there, I f- I actually think it was originally two separate datasets. If I'm... I... Don't quote me on this. Um, but I believe it was two separate datasets, one that had animal recordings and one that had images of animals, but they both had labels of which animal was in the recording or the image. And so what we were able to do is basically to construct a dataset where we said, "Okay, you know, we know the ground truth for what this recording should be in terms of the animal. We know the ground truth of, you know, what the image should be." And then we've also generated some tabular data about like the location where... And, and this was like ran- you know, generated at random somewhat. Like, the location of the image and the location of the recording. But then what that enables you to do is you can suddenly construct SQL queries that are like, um, you know, like, "Which location has the highest number of elephant images?" Um, and so that, you know, requires filtering over the images, but it also requires implementing an aggregation, a max operator over the output. And so, um, and so, yeah. So we were able to effectively construct these queries by s- taking existing Kaggle datasets. And the key reason we used Kaggle datasets was because they already had ground truth labels. So we have ground truth for all of the queries in the benchmark. Um, and that allows us to sort of fairly accurately assess, you know, what is the quality of a given operator. Uh, I think 153 | 154 | 00:39:18,306 --> 00:40:03,266 [Speaker 1] 155 | one limitation of this approach, or something that we'll have to explore in the future is, for something like a semantic map where you're trying to compute a summary, again, it would be very di- And maybe the answer is just that we don't do that in the benchmark. Like, we... If we do extraction of information, it has to be like very, um, well-defined and in a rigid sort of field. Um, so, you know, extracting like the authors of a paper. Like, that's... You know, we can very easily assess whether you did that correctly or not. Um, but I do think that there are... You know, anytime you're using LLMs, evaluating accuracy and quality is, is a challenge. Um, and in other settings, we've seen LLM judges be used, but, um, it... That's... You know, it remains to be seen what we might do for 156 | 157 | 00:40:03,266 --> 00:40:10,666 [Speaker 1] 158 | semantic operations that have less tangible outputs, something like a summary where it's harder to quantify. 159 | 160 | 00:40:10,666 --> 00:41:00,746 [Speaker 0] 161 | Yeah. M- mentioning adding the synthetic tabular data about the locations really reminds me of like, um... I mentioned kind of, uh, how I met Leon and sort of like just through the path of all these things, we came aware of the semantic operators. But, um, we were, we were researching, uh, filtered vector search, like vector searching with filters, and, and there's not a good dataset that, like, has the filters. 
So what we were doing is we would just, uh, do like mod 10 as we're inserting the data into the, into Weaviate, and that made... that's your filter, like one, two, three. And then, and then what we found is that there's actually... It, it makes sense in hindsight, 'cause like the feature, the filter, and then the vectors, the vector embedding correlation. But we found that there's like a correlation between the filter and the vector embedding that impacts the filtered vector. So, so it's just like you're trying to construct these benchmarks that like combine unstructured and structured. It's like I think created its own problem. Yeah. 162 | 163 | 00:41:00,746 --> 00:41:38,445 [Speaker 1] 164 | Yeah. It's very... It's, it's very time-consuming. Um, it's challenging work, um, but it needs to be done. And so I think that for people to, you know, public- Like, I think that publishing benchmark papers is a great, um, you know, a great direction for communities right now. Because like you said, even for a sort of hybrid vector search, um, we need to have really good authoritative... Not... You know, it shouldn't be the end-all, be-all, but you definitely do wanna have some sort of benchmark that everyone can kind of evaluate and get a sense of like, "Well, how well does my system perform? Where do I need to invest more energy and time, uh, to make my system better?" So yeah, I, I hope... totally agree. 165 | 166 | 00:41:38,446 --> 00:42:14,546 [Speaker 0] 167 | Yes. W- And, uh, so I think kind of this is re- You mentioned uh, kinda like the TPC benchmark, and this is something I really wanted to get your opinion on, is, um, the sort of interaction between semantic operators and then like difficult relational operators. So like, and kinda earlier I mentioned asking about like Palimpsest being built on top of other databases, and I'm cur- like, curious like, say, uh, like ClickHouse as a database is optimized for these like really fast analytical queries. And so I'm kinda... 'Cause like the, the... And you mentioned separating the logical from the physical, but like, certain databases implement certain non-semantic operators faster. How does that, how does that kind of interaction... huh? 168 | 169 | 00:42:14,546 --> 00:42:24,705 [Speaker 1] 170 | Yeah. Um, so I think that, um, the... Truthfully right now, for most of the systems [clears throat] and for most queries, 171 | 172 | 00:42:24,706 --> 00:42:30,016 [Speaker 1] 173 | you're... If all you do is focus on optimizing the semantic part, you're gonna do great. [laughs] 174 | 175 | 00:42:30,016 --> 00:42:30,026 [Speaker 0] 176 | [laughs] 177 | 178 | 00:42:30,026 --> 00:43:16,690 [Speaker 1] 179 | Like, like such a... Because the order of magnitude difference in processing, uh, time and cost is so, so large, even in a hybrid query where you do have a mix of relational and semantic operators, um, if all you really... If you fo- If your optimizer primarily focuses on just the semantic portion, uh, then you definitely can win on a lot of queries. That being said, you are 100% right. It is not impossible to construct datasets, uh-... with, let's say, millions of records, where potentially if you do the right thing on the relational side, you actually then only have to process very, very few semantic, uh, operators. 
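The point about letting the relational side prune work before any LLM is invoked can be sketched like this: the cheap structured predicate runs first, and the expensive semantic filter only sees the survivors. Here semantic_passes is a keyword heuristic standing in for a model call, and the counter just shows how many simulated LLM calls a pushed-down structured predicate saves.

from typing import Callable, Dict, List

def hybrid_filter(
    rows: List[Dict],
    relational_pred: Callable[[Dict], bool],   # cheap structured predicate
    semantic_passes: Callable[[str], bool],    # stand-in for an LLM-backed filter
) -> List[Dict]:
    """Push the relational predicate down so the semantic filter sees fewer rows."""
    survivors = [r for r in rows if relational_pred(r)]           # no LLM cost here
    kept = [r for r in survivors if semantic_passes(r["text"])]   # one "LLM call" per survivor
    print(f"LLM calls: {len(survivors)} instead of {len(rows)}")
    return kept

rows = [{"year": y, "text": f"report {y}: supply chain delays"} for y in range(2000, 2025)]
result = hybrid_filter(
    rows,
    relational_pred=lambda r: r["year"] >= 2023,           # prunes 23 of the 25 rows for free
    semantic_passes=lambda text: "supply chain" in text,   # toy heuristic in place of a model
)
print(len(result))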
And so, actually, perhaps the, you know, the amount of compute that goes into doing some relational join or relational filter 180 | 181 | 00:43:16,690 --> 00:44:18,430 [Speaker 1] 182 | is so high that, um, it, it actually requires a, you know, intelligent optimization on the database side as well. Um, and so I think that that, um... I don't have a good answer for you right now about, like, what do the different systems do for this? I think most of them... Um, BigQuery, I would assume would probably be the most ahead right now because they, you know, they've optimized for relational queries for a very long time. Um, and so they probably still have access to those optimizations even when they're using, um, you know, their AI-enabled opera- semantic operators. Um, but yeah. In, in Palimpsest at least I can say we, you know, we don't do anything special for the relational, um, portion. Uh, but in the future if we have... I, one thing that I know we've talked about is adding indices into the system, uh, both in a semantic and non-semantic variety. So in theory if you had an index over some relational data, um, you know, we, we could leverage that instead of just doing some linear scan or something like that. Um, yeah. 183 | 184 | 00:44:20,210 --> 00:44:23,190 [Speaker 1] 185 | Yeah. I've heard of... I guess, like, 186 | 187 | 00:44:23,190 --> 00:44:38,750 [Speaker 0] 188 | quickly talking about that, like, um, I think something we saw with, like, Weaviate is people would build a lot of RAG systems where yeah, the vector search it might be only 10 milliseconds, and that's great, but the LLM is five seconds. So [laughs] it's like you saved me 90 milliseconds, but [laughs] yeah. 189 | 190 | 00:44:38,750 --> 00:44:48,390 [Speaker 1] 191 | Yeah. Yeah, that's, that's, that's just the nature of it. And I think, um... So one thing that I also am somewhat bullish on is that 192 | 193 | 00:44:48,390 --> 00:46:31,290 [Speaker 1] 194 | the processing cost and times are coming down. You have, obviously in the news, massive AI data centers being built, possibly propping up the entire US economy at this moment in time. Um, so this is a massive level of investment. And I think that everyone is hoping that, uh, three to four years from now, uh, you know, compute is gonna be so widely available that it may not be crazy to say that if I have a semantic filter, I could run 10,000 LLM operations in parallel. Right? So, you know, right now you're somewhat limited by, like, the API rate limit and how many, uh, you know, how many, you know, filter tasks you can fire off to some service provider. Or if your system is managing models, uh, on local hardware, right, it's like how many GPUs do you have? If the supply of GPUs goes up in both of those settings, then ideally we can parallelize these requests a lot more. Uh, and then so in that sense, um, the latency on the semantic side would hopefully come down, um, quite a bit. Uh, so, you know, again in this very straw man example of, like, I have 10,000 filters I want to do. If I have 10,000 invocations that I can do in parallel, well suddenly now the time is just the time for one, like, the max-length LLM call in that batch. Um, and perhaps then, you know, your relational operators become much closer in terms of scale, order of magnitude, in terms of time. So, I think definitely in the not too distant future you could see these things start to approach one another.
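As a rough illustration of the parallel-filter scenario described above, here is a sketch that fans out many semantic-filter calls with a concurrency cap standing in for an API rate limit or a fixed GPU pool; call_llm_filter() is a hypothetical stub, not a real provider API.

import asyncio

async def call_llm_filter(item: str, predicate: str) -> bool:
    # Placeholder for a real LLM call deciding whether the item passes the filter.
    await asyncio.sleep(0.01)
    return predicate.lower() in item.lower()

async def semantic_filter(items: list[str], predicate: str, max_concurrency: int = 100) -> list[str]:
    sem = asyncio.Semaphore(max_concurrency)

    async def check(item: str) -> bool:
        async with sem:  # never exceed max_concurrency in-flight calls
            return await call_llm_filter(item, predicate)

    decisions = await asyncio.gather(*(check(it) for it in items))
    return [it for it, keep in zip(items, decisions) if keep]

items = [f"report {i}: adverse reaction to drug X" for i in range(10_000)]
kept = asyncio.run(semantic_filter(items, "adverse reaction"))
print(len(kept))

With enough parallel capacity, the wall-clock time of the whole filter approaches the latency of the single slowest call in the batch, which is the point being made here.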
Um, but right now, um, you know, GPUs are still somewhat scarce and so that, that dominates the bottleneck in computing, uh, these queries. 195 | 196 | 00:46:31,350 --> 00:46:37,820 [Speaker 0] 197 | Yeah, I think that's such a cool one. Uh, like our echo... I don't mean to be plugging Weaviate things like every conversation [laughs] 198 | 199 | 00:46:37,820 --> 00:46:38,960 [Speaker 1] 200 | No, no. 'Cause I like it. It's great. 201 | 202 | 00:46:38,960 --> 00:47:03,350 [Speaker 0] 203 | Just like what my intuition is taking me. But like we do that. Uh, and I'll give a shout-out to Modal. We use Modal as like really great zero to, I don't know about infinity, but like, you know, zero to tons of GPUs. And then that's how we do that, like, batch updating. And yeah, that, that kind of thing. Yeah. Uh, definitely a big to- I think like kind of like co-locating data and compute is a very interesting topic with this. Um, but- 204 | 205 | 00:47:03,350 --> 00:47:47,590 [Speaker 1] 206 | I would... Yeah. Given infinite time, I would love to spend, um... So Palimpsest right now and Abacus, we mostly just support... We have very, very basic support for VLLM. So if you want to run, uh, you know, like a local model on some server. Um, but effectively all of the optimizations are not leveraging, like, the system itself controlling the model weights and the fancy tricks we could do there. That's 100% a great direction for future research, um, but something that is not currently in Palimpsest. Uh, but I think that there's a ton of, uh, room to explore how you could optimize in that setting. Um, but yeah. We... So yeah, we mostly focus kind of, uh, just on the API-based, uh, processing. 207 | 208 | 00:47:47,590 --> 00:47:55,020 [Speaker 0] 209 | Yeah. 'Cause you could have like a five-millisecond database index, but then a 50-millisecond network latency. So it's like... [laughs] 210 | 211 | 00:47:55,020 --> 00:47:55,030 [Speaker 1] 212 | Yeah. [laughs] 213 | 214 | 00:47:55,030 --> 00:48:19,610 [Speaker 0] 215 | And then you've got to co-locate the pods and the zones and [laughs] you know, it's... Uh, so, so yeah. So I really wanted to just sort of maybe anchor with, um... I, I don't know too much about the systems and the optimizations behind them, but maybe if you could just explain Palimpsest further, kind of like the, the landscape between Lotus and, uh, the Element DB and BigQuery again. And then just sort of like what exciting directions are sort of on the roadmap for Palimpsest? 216 | 217 | 00:48:19,610 --> 00:51:22,018 [Speaker 1] 218 | Yeah, totally. Um, so on the Palimpsest side of things, um, as I mentioned, you know, we support, uh, most of the s- most of the semantic operators, um, all the ones in SemBench, um, as well as a few others. And the primary means of optimization, uh, are that we... For maps and filters we sort of have four different optimizations. The first one is model selection, so which model am I gonna use to implement this map or filter? The second one is, uh, we call it mixture of agents. It's effectively an ensemble. Um, so you ask a bunch of what we call proposers to generate, you know, the map output or the filter decision.
And then there is an aggregator which takes all of the proposers' outputs, and then synthesizes them into sort of one final, uh, either, you know, filter decision or, uh, map output. Uh, there is a critique-and-refine operator where an initial model proposes an answer, a second model then critiques the answer, and a third one produces the refined output. Um, and then the final optimization we have is sort of, we call it like context reduction. And so for that one, the simplest example is like if you wanna compute a summary of a paper's main contributions. You don't need to read all 12 pages of the paper. You can usually just read the introduction, and you should ideally get, you know, if the pa- if the paper authors did their job, from just the introduction, you can get a summary of the main contributions. And so in the context-reduction optimization, what we will do for filters and maps is, we'll actually first chunk the input, embed each chunk, take the embedding similarity with the map instruction or the filter instruction, and then take some top-K number of chunks or embeddings and only... apply the map or filter using just those chunks. And we'll use like dot-dot-dot to connect the chunks. Um, and so out of everything I just described, model selection offers you the... offers Palimpsest and Abacus a way to explore the trade-off between latency, cost, and quality just on that thin fra- Pareto frontier of like, you know, which model you decide to use. The mixture of agents and the critique-and-refine are, relative to model selection, pretty much exclusively going to increase your cost, right? So now instead of using one model, we're using three in the critique-and-refine, and using some number of proposers and an aggregator in the ensemble. Uh, but what we do observe is that for large document processing tasks, uh, you do get better performance as well. Um, so in our Abacus paper, um, there is a task around like extracting, um, patients' reactions to drugs from these medical reports that describe, you know, a patient took XYZ drug and then they had this adverse reaction. Um, and so in that setting as well as in this other legal contract understanding, uh, benchmark called CUAD, if you use these ensembles or these critique-and-refine loops, um, you can actually get higher quality. Um, again though at the trade-off of higher cost and higher latency. 219 | 220 | 00:51:22,018 --> 00:54:30,658 [Speaker 1] 221 | And then the context-reduction one is primarily geared towards saving you on cost and on latency. So if you have a large input document, um, by chunking it and only processing your map or filter with the most relevant chunks, we can potentially save a significant amount of input token processing, um, you know, by ignoring most of the irrelevant portions of the document. So that's on the filter and map side. Uh, we have a top-K operator which is sort of, um, like we effectively will retrieve the top K objects from some vector database, um, based on their relevance to some, uh, some natural language, uh, text string. Uh, and so we can optimize for the value K of how many objects do you want returned. So effectively you can think of it as like if you have some pipeline that's doing some sort of RAG operation, we can potentially optimize the value K for what you return. And, you know, ideally small K is gonna give you obviously fewer items, um, but also could avoid cluttering the context. So it's, it's a little less clean in terms of like is it saving you on cost?
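A toy sketch of the context-reduction optimization described a moment ago: chunk the input, score each chunk against the map or filter instruction, keep the top-K chunks, and join them with an ellipsis before the LLM call. A bag-of-words cosine similarity stands in here for a real embedding model, and the chunk size and K value are arbitrary.

import math
from collections import Counter

def embed(text: str) -> Counter:
    return Counter(text.lower().split())  # toy "embedding": word counts

def cosine(a: Counter, b: Counter) -> float:
    dot = sum(a[t] * b[t] for t in a)
    norm = math.sqrt(sum(v * v for v in a.values())) * math.sqrt(sum(v * v for v in b.values()))
    return dot / norm if norm else 0.0

def reduce_context(document: str, instruction: str, chunk_size: int = 50, top_k: int = 3) -> str:
    words = document.split()
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    q = embed(instruction)
    # Keep only the K chunks most similar to the instruction; a real system might
    # instead preserve document order among the selected chunks.
    kept = sorted(chunks, key=lambda c: cosine(embed(c), q), reverse=True)[:top_k]
    return " ... ".join(kept)  # only this reduced context is sent to the LLM

doc = "Introduction. Our main contributions are ... " * 20  # placeholder long document
print(reduce_context(doc, "summarize the paper's main contributions"))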
Is it improving on quality? Um, but the nice thing is that we at least give the optimizer the ability to try and figure out like what's the optimal value of K for your task and your workload. Um, and then the final one is semantic joins. And so we offer a, uh, the naive nested-loops join, which is just take every pair of items and run an LLM. Um, but we also have an embedding-based approach, uh, in which we compute an embedding for every item, and then we use aga- like an oracle model on some subset of the data to come up with a high and a low threshold. And then for the rest of the query, you know, anything where the embedding pair similarity is above the high threshold, we automatically join it. Below the low threshold, we filter it out. And if it's in that middle band, then we still use the oracle. Um, yeah, so that's like sort of the overview landscape of optimizations in Palimpsest and Abacus. And the way that the system actually determines what... which optimizations to apply is that the... there's like sort of a fixed-sample budget which can be specified in dollar cost now. So you can say like, "I'll spend f- f- you know, a dollar on LLM calls to try out a bunch of these different techniques." Uh, and we have some like clever algorithms around trying to pick promising operators early, and then continue to sample those operators to get better and better estimates about their performance. Um, and then once the sampling budget is exhausted, then we do that sort of, as I was saying, like that logical-to-physical plan construction, generating a very large space of physical plans using all the operators that we sampled, and all the different approaches to implementing the map, the filter, the join, top K, uh, and then returning sort of the Pareto-optimal plan for the user's objective. So that's, um... And usually when I say all that, there's like a very nice slide behind me that I can like click through and point at. Um, so I don't know how well that comes across if I just say it out loud. Um, but feel... Like if you have any questions about that, please ask me so I can... I can help clarify. 222 | 223 | 00:54:30,658 --> 00:54:36,138 [Speaker 0] 224 | [laughs] Yeah, it's, it's so cool. So many exciting ideas are just, yeah, hard to even pick a question- 225 | 226 | 00:54:36,138 --> 00:54:36,188 [Speaker 1] 227 | Of course 228 | 229 | 00:54:36,188 --> 00:54:57,968 [Speaker 0] 230 | ... 'cause there's so many cool things to explore. But, uh, I just... I guess I had a quick question about like sort of the implementation of semantic, uh, filtering or classification, like, is that like... it could just be an LLM inference or it could itself be a RAG inference. Or, or I guess it could be like an agentic inference where you use like a function calling loop and it's got tools to implement like each of the calls? Um, is that... 231 | 232 | 00:54:57,968 --> 00:54:57,977 [Speaker 1] 233 | Yeah. 234 | 235 | 00:54:57,978 --> 00:54:59,238 [Speaker 0] 236 | Is that what... Yeah. 237 | 238 | 00:54:59,238 --> 00:58:48,786 [Speaker 1] 239 | Yeah, I... we... I mean the spa- like the s- potential space of... and I think part of what... so part of what drove our interest in this direction that we took for Abacus was that we imagined that the space of optimizations is... is cra- is insanely large. Um, and some of the other s- semantic query processing engines, uh, again... And this is like...
This is our difference, because this is sort of our very biased view of the way that these engines should work. So naturally, this is the line on which our system differs from the others. Um, in a lot of other systems, there's sort of this bias put in, in terms of the people who wrote the system, where they say like, there's w- a certain goal, which is like, "We're gonna try to preserve the quality of some model, but potentially save cost by using like a proxy to process some amount of the data." But what that really kind of restricts the system to is like, it can only optimize for saving on cost and quality relative to some oracle. And if that oracle is not great, then you're kind of... Like, there's not really much that you can do in tur- in the system internally to optimize and make the query better. Um, and one thing that we wanted in Abacus was to admit as large a space as possible of these types of optimizations, and provide effectively an interface in which developers can write new optimizations. And by design, our optimizer treats each optimization as a black box. That's why it has to sort of sample them to get an estimate of how well they perform. Um, but because of this, basically what that means is that as long as you... Like, let's say that you decide that we should try doing semantic filtering, um, by first doing RAG to look up... Like, embed everything that you're gonna filter over. Use RAG to get the top K equal to 100, and maybe K is a hyperparameter so it can be scaled, and then you only process like those K items with the LLM. And that optimization, again, the lever is that value of K. So like if K is one, it's a super cheap operation, but it probably misses a lot of important things. If K is the entire dataset, well then you're just running an LLM on everything at the end of the day, so it's super expensive. But you can tune this K parameter to kind of trade off between cost and hopefully accuracy. You can go write that in a class. You just have to implement two functions. You can then plug it into Palimpsest, and Abacus will basically be able to optimize over it. Um, again, because it sort of treats every optimization as a black box that has to be sampled in order to assess its, its quality, its cost, and its latency. Um, so we definitely believe that, um, you know, this admits a lot of fancy optimizations. Like in theory, you could even have an optimization where the optimization is literally like an agent will write Python code, and then apply that Python code to do the filter operation. Uh, and perhaps in the Python code, it's invoking like some Hugging Face model, or if it's like simple enough of a filter, it can just be implemented with a regular expression or, uh, you know, other, like, string-matching logic. That's also possible. And then the great thing there is like, you write the function once, that's one LLM call, but then you apply the function to the rest of the dataset so you save a ton on, on your cost. And th- that's sort of the key, um, like biased perspective for the Palimpsest and Abacus system, is like you could go and write that optimization today, plug it into our system, and it would work. Um, you know, a- and, uh, so we're not, we're not being prescriptive about like what optimizations should exist in the system. We're not curating that. We're admitting the widest possible space, and then saying that we're gonna try...
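A hypothetical sketch of that RAG-style filter written as a pluggable, black-box optimization with K as its lever. The class and method names are illustrative only; the transcript says an optimization implements two functions but does not name them, so this is not Palimpsest's real interface, and retrieve_top_k() and llm_filter() are stand-in stubs.

from dataclasses import dataclass

def retrieve_top_k(items: list[str], predicate: str, k: int) -> list[str]:
    # Stand-in for embedding the items and retrieving the k most relevant candidates.
    return sorted(items, key=lambda it: -sum(w in it.lower() for w in predicate.lower().split()))[:k]

def llm_filter(item: str, predicate: str) -> bool:
    # Stand-in for a single LLM call that decides whether the item passes the filter.
    return predicate.lower().split()[0] in item.lower()

@dataclass
class RagFilterOptimization:
    k: int = 100  # the lever: small k is cheap but misses items; k = dataset size is a full LLM scan

    def estimate_cost(self, items: list[str]) -> int:
        # Roughly one LLM call per retrieved candidate.
        return min(self.k, len(items))

    def execute(self, items: list[str], predicate: str) -> list[str]:
        candidates = retrieve_top_k(items, predicate, self.k)
        return [it for it in candidates if llm_filter(it, predicate)]

opt = RagFilterOptimization(k=10)
print(opt.execute([f"ticket {i}: billing question" for i in range(1000)], "billing question"))

A black-box optimizer in the spirit described here would then sample execute() on a small subset of the data to estimate this implementation's cost, quality, and latency against the other candidate filter implementations.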
And our, our secret sauce is like, "Can we search over a massive space of these optimizations and find ones which are good for your query?" 240 | 241 | 00:58:48,786 --> 00:59:33,585 [Speaker 0] 242 | Yeah, that's so cool. And that, and that has made me think about like kind of the integration of DSPy where you might submit a prompt for like a t- one of these tasks, and then you optimize that prompt with GEPA as one of the optimizations you're exploring in parallel. And yeah, it's, it's really amazing. And then the systems thing, mentioning like each of the inferences is a RAG inference, now you maybe have this like batch query workload. And it's kind of something like when we were working with NVIDIA, they have like the CAGRA vector index for like running on GPUs and the... And so it's like... It's really particular... It's particularly good at building the v- proximity graph really quickly and then batch queries. And so I've just always kind of been thinking now about like, oh, what is the application of like batch queries? And maybe this is that example because you're about to run 100,000 [laughs] RAG inferences. [laughs] 243 | 244 | 00:59:33,585 --> 00:59:34,935 [Speaker 1] 245 | Yeah, right. 246 | 247 | 00:59:34,935 --> 00:59:35,705 [Speaker 0] 248 | Or, or... [laughs] 249 | 250 | 00:59:35,705 --> 00:59:36,136 [Speaker 1] 251 | Yeah, yeah. 252 | 253 | 00:59:36,136 --> 00:59:36,316 [Speaker 0] 254 | Yeah. 255 | 256 | 00:59:36,316 --> 01:00:01,066 [Speaker 1] 257 | So it's, um, yeah. But it's a, it's an exciting space, um, and I think, uh... And that's, and that's one of the key things, is that the space of optimizations, because AI development happens so quickly, it's just gonna grow and grow. Um, and so we're, we're designing to try and be flexible, um, to admit new optimizations and new techniques as we get more great new papers about all these different AI processing techniques. 258 | 259 | 01:00:01,066 --> 01:00:20,586 [Speaker 0] 260 | Yeah, amazing. Well, Matthew, thanks so much for joining the Weaviate Podcast. I've learned so much about semantic query processing engines and all these things are just so exciting. Uh, one, one sort of anchoring question I'd like to ask, just maybe like broadly outside of your particular expertise in this work, is just sort of like what future directions for AI excite you the most? Like, what kind of gets you out of bed in the morning thinking about AI? 261 | 262 | 01:00:20,586 --> 01:00:27,696 [Speaker 1] 263 | Yeah. Um, I think, uh, I guess one very succinct one for me is, is deep research. Um, and so- 264 | 265 | 01:00:27,696 --> 01:00:27,696 [Speaker 0] 266 | Mm-hmm 267 | 268 | 01:00:27,696 --> 01:00:37,086 [Speaker 1] 269 | ... I've been really... Because, uh, at a very high level, deep research actually looks a lot like a declarative 270 | 271 | 01:00:37,086 --> 01:01:17,236 [Speaker 1] 272 | database processing system. Like, users give very high level intent. The system then has to take that intent, come up with some plan for how it's gonna, you know, answer the question that the user provided or do the task that the user wants it to do. Um, and then, you know, after 20 or 30 minutes, I think for current sy- systems, like, maybe less time, but, y- you know, the system will come back with your final output. But that maps very closely to this declarative world in which you say like, the user doesn't tell the deep research agent, "These are the steps you have to follow."
They just say like, "Here's some Excel spreadsheets," like, "Come up with a financial model for investing in Fortune 500 companies," or something like that. Or like, you know- 273 | 274 | 01:01:17,236 --> 01:01:17,236 [Speaker 0] 275 | [laughs] 276 | 277 | 01:01:17,236 --> 01:01:21,955 [Speaker 1] 278 | ... we hope students don't do this, but I am sure students have done this, like, "Here's a chapter of my textbook." 279 | 280 | 01:01:21,955 --> 01:01:21,966 [Speaker 0] 281 | [laughs] 282 | 283 | 01:01:21,966 --> 01:01:47,306 [Speaker 1] 284 | Like, "Generate a study plan [laughs] for me." Um, and so, you know, what users care about is like, "I have this instruction, I want the output," and it's up to the system to optimize, like, how do you go from instruction to output? So I'm excited by that 'cause I think that it, it maps pretty closely to this space, and i- it remains to be seen what's gonna be the overlap or the parallels between semantic query processing engines and deep research. 285 | 286 | 01:01:47,306 --> 01:01:54,045 [Speaker 0] 287 | Yeah, I, I think maybe the generator study plan for the student [laughs] is such a, is such a bad use case of it. [laughs] 288 | 289 | 01:01:54,046 --> 01:01:57,045 [Speaker 1] 290 | [laughs] It's definitely in demand, I can say that. 291 | 292 | 01:01:57,046 --> 01:02:05,346 [Speaker 0] 293 | [laughs] But like, the, um, you know, get my, my tweet, get this a million impressions, whatever you have to do, or... [laughs] 294 | 295 | 01:02:05,346 --> 01:02:09,246 [Speaker 1] 296 | Yeah. Well, the, the opportunities for optimization are, are boundless. So that's good. 297 | 298 | 01:02:09,246 --> 01:02:09,906 [Speaker 0] 299 | [laughs] 300 | 301 | 01:02:09,906 --> 01:02:12,145 [Speaker 1] 302 | That's good for me as a researcher. 303 | 304 | 01:02:12,145 --> 01:02:24,286 [Speaker 0] 305 | [laughs] Yeah, amazing. Well, Matthew, thanks again for joining. Uh, just amazing, really loved learning about all these things from you. And yeah, just really excited to keep up with the progress of this. It's such an exciting field. Thanks so much. 306 | 307 | 01:02:24,286 --> 01:02:25,806 [Speaker 1] 308 | Thank you. Bye. --------------------------------------------------------------------------------