├── .gitignore ├── README.md ├── chunker.py ├── documents └── startupideas.txt ├── requirements.txt ├── summarization.ipynb ├── toy.ipynb └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb_checkpoints 2 | __pycache__ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Optimal Chunk Size for Large Document Summarization 2 | 3 | A method to choose the **optimal chunk size** for large document summarization. 4 | 5 | See our [blog post](https://vectify.ai/blog/LargeDocumentSummarization) for a detailed introduction. 6 | 7 | * [toy.ipynb](https://github.com/VectifyAI/LargeDocumentSummarization/blob/master/toy.ipynb) shows a toy chunking comparison. 8 | 9 | * [summarization.ipynb](https://github.com/VectifyAI/LargeDocumentSummarization/blob/master/summarization.ipynb) shows a practical large document summarization comparison. 10 | 11 | [Get connected](https://ii2abc2jejf.typeform.com/to/jiDMMXwX) to see how we could help to improve your LLM pipelines. 
# (README.md, continued) By Vectify AI
# --------------------------------------------------------------------------------
# /chunker.py:
# --------------------------------------------------------------------------------
import math

import tiktoken


def get_token_size(document, model):
    """Return the number of tokens in ``document``.

    Args:
        document: Text to measure.
        model: Model name passed to ``tiktoken.encoding_for_model`` to
            select the tokenizer.

    Returns:
        The length of the token sequence produced by encoding ``document``.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    return len(tokenizer.encode(document))


def naive_chunker(document, chunk_size, model):
    """Split ``document`` into consecutive chunks of ``chunk_size`` tokens.

    Every chunk holds exactly ``chunk_size`` tokens except the last one,
    which holds whatever remains (and may therefore be much shorter).

    Args:
        document: Text to split.
        chunk_size: Number of tokens per chunk; must be positive.
        model: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        A list of decoded text chunks, in document order. An empty document
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would make range() empty and silently drop the whole
    # document, so reject it up front (zero already raised ValueError).
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    tokenizer = tiktoken.encoding_for_model(model)
    document_tokens = tokenizer.encode(document)

    return [
        tokenizer.decode(document_tokens[start:start + chunk_size])
        for start in range(0, len(document_tokens), chunk_size)
    ]


def auto_chunker(document, max_chunk_size, model):
    """Split ``document`` into the fewest chunks of near-equal token length.

    The chunk count is the smallest number of chunks that keeps every chunk
    at or below ``max_chunk_size`` tokens. The tokens are then spread over
    those chunks so that chunk lengths differ by at most one token, with the
    longer chunks placed first.

    Args:
        document: Text to split.
        max_chunk_size: Upper bound on tokens per chunk; must be positive.
        model: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        A list of decoded text chunks, in document order. An empty document
        yields an empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError(
            f"max_chunk_size must be positive, got {max_chunk_size!r}"
        )

    tokenizer = tiktoken.encoding_for_model(model)
    document_tokens = tokenizer.encode(document)
    document_size = len(document_tokens)

    # Nothing to split. Returning early also avoids dividing by a chunk
    # count of zero below, and matches naive_chunker's result for "".
    if document_size == 0:
        return []

    # total chunk number
    chunk_count = math.ceil(document_size / max_chunk_size)
    # Every chunk gets base_size tokens; the first longer_chunk_number
    # chunks get one extra so the sizes add up to document_size exactly.
    base_size, longer_chunk_number = divmod(document_size, chunk_count)

    chunks = []
    chunk_start = 0
    for i in range(chunk_count):
        chunk_end = chunk_start + base_size
        if i < longer_chunk_number:
            chunk_end += 1
        chunks.append(tokenizer.decode(document_tokens[chunk_start:chunk_end]))
        chunk_start = chunk_end

    return chunks


# --------------------------------------------------------------------------------
# /documents/startupideas.txt:
# --------------------------------------------------------------------------------
# 1 | In 1995 I started a company to
put art galleries 2 | online. But galleries didn't want to be online. It's not how the 3 | art business works. So why did I spend 6 months working on this 4 | stupid idea? Because I didn't pay attention to users. I invented 5 | a model of the world that didn't correspond to reality, and worked 6 | from that. I didn't notice my model was wrong until I tried 7 | to convince users to pay for what we'd built. Even then I took 8 | embarrassingly long to catch on. I was attached to my model of the 9 | world, and I'd spent a lot of time on the software. They had to 10 | want it!Why do so many founders build things no one wants? Because they 11 | begin by trying to think of startup ideas. That m.o. is doubly 12 | dangerous: it doesn't merely yield few good ideas; it yields bad 13 | ideas that sound plausible enough to fool you into working on them.At YC we call these "made-up" or "sitcom" startup ideas. Imagine 14 | one of the characters on a TV show was starting a startup. The 15 | writers would have to invent something for it to do. But coming 16 | up with good startup ideas is hard. It's not something you can do 17 | for the asking. So (unless they got amazingly lucky) the writers 18 | would come up with an idea that sounded plausible, but was actually 19 | bad.For example, a social network for pet owners. It doesn't sound 20 | obviously mistaken. Millions of people have pets. Often they care 21 | a lot about their pets and spend a lot of money on them. Surely 22 | many of these people would like a site where they could talk to 23 | other pet owners. Not all of them perhaps, but if just 2 or 3 24 | percent were regular visitors, you could have millions of users. 25 | You could serve them targeted offers, and maybe charge for premium 26 | features. 27 | [1]The danger of an idea like this is that when you run it by your 28 | friends with pets, they don't say "I would never use this." They 29 | say "Yeah, maybe I could see using something like that." 
Even when 30 | the startup launches, it will sound plausible to a lot of people. 31 | They don't want to use it themselves, at least not right now, but 32 | they could imagine other people wanting it. Sum that reaction 33 | across the entire population, and you have zero users. 34 | [2] 35 | WellWhen a startup launches, there have to be at least some users who 36 | really need what they're making — not just people who could see 37 | themselves using it one day, but who want it urgently. Usually 38 | this initial group of users is small, for the simple reason that 39 | if there were something that large numbers of people urgently needed 40 | and that could be built with the amount of effort a startup usually 41 | puts into a version one, it would probably already exist. Which 42 | means you have to compromise on one dimension: you can either build 43 | something a large number of people want a small amount, or something 44 | a small number of people want a large amount. Choose the latter. 45 | Not all ideas of that type are good startup ideas, but nearly all 46 | good startup ideas are of that type.Imagine a graph whose x axis represents all the people who might 47 | want what you're making and whose y axis represents how much they 48 | want it. If you invert the scale on the y axis, you can envision 49 | companies as holes. Google is an immense crater: hundreds of 50 | millions of people use it, and they need it a lot. A startup just 51 | starting out can't expect to excavate that much volume. So you 52 | have two choices about the shape of hole you start with. You can 53 | either dig a hole that's broad but shallow, or one that's narrow 54 | and deep, like a well.Made-up startup ideas are usually of the first type. Lots of people 55 | are mildly interested in a social network for pet owners.Nearly all good startup ideas are of the second type. Microsoft 56 | was a well when they made Altair Basic. 
There were only a couple 57 | thousand Altair owners, but without this software they were programming 58 | in machine language. Thirty years later Facebook had the same 59 | shape. Their first site was exclusively for Harvard students, of 60 | which there are only a few thousand, but those few thousand users 61 | wanted it a lot.When you have an idea for a startup, ask yourself: who wants this 62 | right now? Who wants this so much that they'll use it even when 63 | it's a crappy version one made by a two-person startup they've never 64 | heard of? If you can't answer that, the idea is probably bad. 65 | [3]You don't need the narrowness of the well per se. It's depth you 66 | need; you get narrowness as a byproduct of optimizing for depth 67 | (and speed). But you almost always do get it. In practice the 68 | link between depth and narrowness is so strong that it's a good 69 | sign when you know that an idea will appeal strongly to a specific 70 | group or type of user.But while demand shaped like a well is almost a necessary condition 71 | for a good startup idea, it's not a sufficient one. If Mark 72 | Zuckerberg had built something that could only ever have appealed 73 | to Harvard students, it would not have been a good startup idea. 74 | Facebook was a good idea because it started with a small market 75 | there was a fast path out of. Colleges are similar enough that if 76 | you build a facebook that works at Harvard, it will work at any 77 | college. So you spread rapidly through all the colleges. Once you 78 | have all the college students, you get everyone else simply by 79 | letting them in.Similarly for Microsoft: Basic for the Altair; Basic for other 80 | machines; other languages besides Basic; operating systems; 81 | applications; IPO. 82 | SelfHow do you tell whether there's a path out of an idea? How do you 83 | tell whether something is the germ of a giant company, or just a 84 | niche product? Often you can't. 
The founders of Airbnb didn't 85 | realize at first how big a market they were tapping. Initially 86 | they had a much narrower idea. They were going to let hosts rent 87 | out space on their floors during conventions. They didn't foresee 88 | the expansion of this idea; it forced itself upon them gradually. 89 | All they knew at first is that they were onto something. That's 90 | probably as much as Bill Gates or Mark Zuckerberg knew at first.Occasionally it's obvious from the beginning when there's a path 91 | out of the initial niche. And sometimes I can see a path that's 92 | not immediately obvious; that's one of our specialties at YC. But 93 | there are limits to how well this can be done, no matter how much 94 | experience you have. The most important thing to understand about 95 | paths out of the initial idea is the meta-fact that these are hard 96 | to see.So if you can't predict whether there's a path out of an idea, how 97 | do you choose between ideas? The truth is disappointing but 98 | interesting: if you're the right sort of person, you have the right 99 | sort of hunches. If you're at the leading edge of a field that's 100 | changing fast, when you have a hunch that something is worth doing, 101 | you're more likely to be right.In Zen and the Art of Motorcycle Maintenance, Robert Pirsig says: 102 | 103 | You want to know how to paint a perfect painting? It's easy. Make 104 | yourself perfect and then just paint naturally. 105 | 106 | I've wondered about that passage since I read it in high school. 107 | I'm not sure how useful his advice is for painting specifically, 108 | but it fits this situation well. Empirically, the way to have good 109 | startup ideas is to become the sort of person who has them.Being at the leading edge of a field doesn't mean you have to be 110 | one of the people pushing it forward. You can also be at the leading 111 | edge as a user. 
It was not so much because he was a programmer 112 | that Facebook seemed a good idea to Mark Zuckerberg as because he 113 | used computers so much. If you'd asked most 40 year olds in 2004 114 | whether they'd like to publish their lives semi-publicly on the 115 | Internet, they'd have been horrified at the idea. But Mark already 116 | lived online; to him it seemed natural.Paul Buchheit says that people at the leading edge of a rapidly 117 | changing field "live in the future." Combine that with Pirsig and 118 | you get: 119 | 120 | Live in the future, then build what's missing. 121 | 122 | That describes the way many if not most of the biggest startups got 123 | started. Neither Apple nor Yahoo nor Google nor Facebook were even 124 | supposed to be companies at first. They grew out of things their 125 | founders built because there seemed a gap in the world.If you look at the way successful founders have had their ideas, 126 | it's generally the result of some external stimulus hitting a 127 | prepared mind. Bill Gates and Paul Allen hear about the Altair and 128 | think "I bet we could write a Basic interpreter for it." Drew Houston 129 | realizes he's forgotten his USB stick and thinks "I really need to 130 | make my files live online." Lots of people heard about the Altair. 131 | Lots forgot USB sticks. The reason those stimuli caused those 132 | founders to start companies was that their experiences had prepared 133 | them to notice the opportunities they represented.The verb you want to be using with respect to startup ideas is not 134 | "think up" but "notice." At YC we call ideas that grow naturally 135 | out of the founders' own experiences "organic" startup ideas. The 136 | most successful startups almost all begin this way.That may not have been what you wanted to hear. You may have 137 | expected recipes for coming up with startup ideas, and instead I'm 138 | telling you that the key is to have a mind that's prepared in the 139 | right way. 
But disappointing though it may be, this is the truth. 140 | And it is a recipe of a sort, just one that in the worst case takes 141 | a year rather than a weekend. If you're not at the leading edge of some rapidly changing field, 142 | you can get to one. For example, anyone reasonably smart can 143 | probably get to an edge of programming (e.g. building mobile apps) 144 | in a year. Since a successful startup will consume at least 3-5 145 | years of your life, a year's preparation would be a reasonable 146 | investment. Especially if you're also looking for a cofounder. 147 | [4] You don't have to learn programming to be at the leading edge of a 148 | domain that's changing fast. Other domains change fast. But while 149 | learning to hack is not necessary, it is for the foreseeable future 150 | sufficient. As Marc Andreessen put it, software is eating the world, 151 | and this trend has decades left to run. Knowing how to hack also means that when you have ideas, you'll be 152 | able to implement them. That's not absolutely necessary (Jeff Bezos 153 | couldn't) but it's an advantage. It's a big advantage, when you're 154 | considering an idea like putting a college facebook online, if 155 | instead of merely thinking "That's an interesting idea," you can 156 | think instead "That's an interesting idea. I'll try building an 157 | initial version tonight." It's even better when you're both a 158 | programmer and the target user, because then the cycle of generating 159 | new versions and testing them on users can happen inside one head. 160 | Noticing: Once you're living in the future in some respect, the way to notice 161 | startup ideas is to look for things that seem to be missing. If 162 | you're really at the leading edge of a rapidly changing field, there 163 | will be things that are obviously missing. What won't be obvious 164 | is that they're startup ideas. So if you want to find startup 165 | ideas, don't merely turn on the filter "What's missing?"
Also turn 166 | off every other filter, particularly "Could this be a big company?" 167 | There's plenty of time to apply that test later. But if you're 168 | thinking about that initially, it may not only filter out lots 169 | of good ideas, but also cause you to focus on bad ones.Most things that are missing will take some time to see. You almost 170 | have to trick yourself into seeing the ideas around you.But you know the ideas are out there. This is not one of those 171 | problems where there might not be an answer. It's impossibly 172 | unlikely that this is the exact moment when technological progress 173 | stops. You can be sure people are going to build things in the 174 | next few years that will make you think "What did I do before x?"And when these problems get solved, they will probably seem flamingly 175 | obvious in retrospect. What you need to do is turn off the filters 176 | that usually prevent you from seeing them. The most powerful is 177 | simply taking the current state of the world for granted. Even the 178 | most radically open-minded of us mostly do that. You couldn't get 179 | from your bed to the front door if you stopped to question everything.But if you're looking for startup ideas you can sacrifice some of 180 | the efficiency of taking the status quo for granted and start to 181 | question things. Why is your inbox overflowing? Because you get 182 | a lot of email, or because it's hard to get email out of your inbox? 183 | Why do you get so much email? What problems are people trying to 184 | solve by sending you email? Are there better ways to solve them? 185 | And why is it hard to get emails out of your inbox? Why do you 186 | keep emails around after you've read them? Is an inbox the optimal 187 | tool for that?Pay particular attention to things that chafe you. The advantage 188 | of taking the status quo for granted is not just that it makes life 189 | (locally) more efficient, but also that it makes life more tolerable. 
190 | If you knew about all the things we'll get in the next 50 years but 191 | don't have yet, you'd find present day life pretty constraining, 192 | just as someone from the present would if they were sent back 50 193 | years in a time machine. When something annoys you, it could be 194 | because you're living in the future.When you find the right sort of problem, you should probably be 195 | able to describe it as obvious, at least to you. When we started 196 | Viaweb, all the online stores were built by hand, by web designers 197 | making individual HTML pages. It was obvious to us as programmers 198 | that these sites would have to be generated by software. 199 | [5]Which means, strangely enough, that coming up with startup ideas 200 | is a question of seeing the obvious. That suggests how weird this 201 | process is: you're trying to see things that are obvious, and yet 202 | that you hadn't seen.Since what you need to do here is loosen up your own mind, it may 203 | be best not to make too much of a direct frontal attack on the 204 | problem — i.e. to sit down and try to think of ideas. The best 205 | plan may be just to keep a background process running, looking for 206 | things that seem to be missing. Work on hard problems, driven 207 | mainly by curiosity, but have a second self watching over your 208 | shoulder, taking note of gaps and anomalies. 209 | [6]Give yourself some time. You have a lot of control over the rate 210 | at which you turn yours into a prepared mind, but you have less 211 | control over the stimuli that spark ideas when they hit it. If 212 | Bill Gates and Paul Allen had constrained themselves to come up 213 | with a startup idea in one month, what if they'd chosen a month 214 | before the Altair appeared? They probably would have worked on a 215 | less promising idea. Drew Houston did work on a less promising 216 | idea before Dropbox: an SAT prep startup. 
But Dropbox was a much 217 | better idea, both in the absolute sense and also as a match for his 218 | skills. 219 | [7]A good way to trick yourself into noticing ideas is to work on 220 | projects that seem like they'd be cool. If you do that, you'll 221 | naturally tend to build things that are missing. It wouldn't seem 222 | as interesting to build something that already existed.Just as trying to think up startup ideas tends to produce bad ones, 223 | working on things that could be dismissed as "toys" often produces 224 | good ones. When something is described as a toy, that means it has 225 | everything an idea needs except being important. It's cool; users 226 | love it; it just doesn't matter. But if you're living in the future 227 | and you build something cool that users love, it may matter more 228 | than outsiders think. Microcomputers seemed like toys when Apple 229 | and Microsoft started working on them. I'm old enough to remember 230 | that era; the usual term for people with their own microcomputers 231 | was "hobbyists." BackRub seemed like an inconsequential science 232 | project. The Facebook was just a way for undergrads to stalk one 233 | another.At YC we're excited when we meet startups working on things that 234 | we could imagine know-it-alls on forums dismissing as toys. To us 235 | that's positive evidence an idea is good.If you can afford to take a long view (and arguably you can't afford 236 | not to), you can turn "Live in the future and build what's missing" 237 | into something even better: 238 | 239 | Live in the future and build what seems interesting. 240 | 241 | SchoolThat's what I'd advise college students to do, rather than trying 242 | to learn about "entrepreneurship." "Entrepreneurship" is something 243 | you learn best by doing it. The examples of the most successful 244 | founders make that clear. What you should be spending your time 245 | on in college is ratcheting yourself into the future. 
College is 246 | an incomparable opportunity to do that. What a waste to sacrifice 247 | an opportunity to solve the hard part of starting a startup — becoming 248 | the sort of person who can have organic startup ideas — by 249 | spending time learning about the easy part. Especially since 250 | you won't even really learn about it, any more than you'd learn 251 | about sex in a class. All you'll learn is the words for things.The clash of domains is a particularly fruitful source of ideas. 252 | If you know a lot about programming and you start learning about 253 | some other field, you'll probably see problems that software could 254 | solve. In fact, you're doubly likely to find good problems in 255 | another domain: (a) the inhabitants of that domain are not as likely 256 | as software people to have already solved their problems with 257 | software, and (b) since you come into the new domain totally ignorant, 258 | you don't even know what the status quo is to take it for granted.So if you're a CS major and you want to start a startup, instead 259 | of taking a class on entrepreneurship you're better off taking a 260 | class on, say, genetics. Or better still, go work for a biotech 261 | company. CS majors normally get summer jobs at computer hardware 262 | or software companies. But if you want to find startup ideas, you 263 | might do better to get a summer job in some unrelated field. 264 | [8]Or don't take any extra classes, and just build things. It's no 265 | coincidence that Microsoft and Facebook both got started in January. 266 | At Harvard that is (or was) Reading Period, when students have no 267 | classes to attend because they're supposed to be studying for finals. 268 | [9]But don't feel like you have to build things that will become startups. That's 269 | premature optimization. Just build things. Preferably with other 270 | students. It's not just the classes that make a university such a 271 | good place to crank oneself into the future. 
You're also surrounded 272 | by other people trying to do the same thing. If you work together 273 | with them on projects, you'll end up producing not just organic 274 | ideas, but organic ideas with organic founding teams — and that, 275 | empirically, is the best combination.Beware of research. If an undergrad writes something all his friends 276 | start using, it's quite likely to represent a good startup idea. 277 | Whereas a PhD dissertation is extremely unlikely to. For some 278 | reason, the more a project has to count as research, the less likely 279 | it is to be something that could be turned into a startup. 280 | [10] 281 | I think the reason is that the subset of ideas that count as research 282 | is so narrow that it's unlikely that a project that satisfied that 283 | constraint would also satisfy the orthogonal constraint of solving 284 | users' problems. Whereas when students (or professors) build 285 | something as a side-project, they automatically gravitate toward 286 | solving users' problems — perhaps even with an additional energy 287 | that comes from being freed from the constraints of research. 288 | CompetitionBecause a good idea should seem obvious, when you have one you'll 289 | tend to feel that you're late. Don't let that deter you. Worrying 290 | that you're late is one of the signs of a good idea. Ten minutes 291 | of searching the web will usually settle the question. Even if you 292 | find someone else working on the same thing, you're probably not 293 | too late. It's exceptionally rare for startups to be killed by 294 | competitors — so rare that you can almost discount the possibility. 295 | So unless you discover a competitor with the sort of lock-in that 296 | would prevent users from choosing you, don't discard the idea.If you're uncertain, ask users. The question of whether you're too 297 | late is subsumed by the question of whether anyone urgently needs 298 | what you plan to make. 
If you have something that no competitor 299 | does and that some subset of users urgently need, you have a 300 | beachhead. 301 | [11]The question then is whether that beachhead is big enough. Or more 302 | importantly, who's in it: if the beachhead consists of people doing 303 | something lots more people will be doing in the future, then it's 304 | probably big enough no matter how small it is. For example, if 305 | you're building something differentiated from competitors by the 306 | fact that it works on phones, but it only works on the newest phones, 307 | that's probably a big enough beachhead.Err on the side of doing things where you'll face competitors. 308 | Inexperienced founders usually give competitors more credit than 309 | they deserve. Whether you succeed depends far more on you than on 310 | your competitors. So better a good idea with competitors than a 311 | bad one without.You don't need to worry about entering a "crowded market" so long 312 | as you have a thesis about what everyone else in it is overlooking. 313 | In fact that's a very promising starting point. Google was that 314 | type of idea. Your thesis has to be more precise than "we're going 315 | to make an x that doesn't suck" though. You have to be able to 316 | phrase it in terms of something the incumbents are overlooking. 317 | Best of all is when you can say that they didn't have the courage 318 | of their convictions, and that your plan is what they'd have done 319 | if they'd followed through on their own insights. Google was that 320 | type of idea too. The search engines that preceded them shied away 321 | from the most radical implications of what they were doing — particularly 322 | that the better a job they did, the faster users would 323 | leave.A crowded market is actually a good sign, because it means both 324 | that there's demand and that none of the existing solutions are 325 | good enough. 
A startup can't hope to enter a market that's obviously 326 | big and yet in which they have no competitors. So any startup that 327 | succeeds is either going to be entering a market with existing 328 | competitors, but armed with some secret weapon that will get them 329 | all the users (like Google), or entering a market that looks small 330 | but which will turn out to be big (like Microsoft). 331 | [12] 332 | FiltersThere are two more filters you'll need to turn off if you want to 333 | notice startup ideas: the unsexy filter and the schlep filter.Most programmers wish they could start a startup by just writing 334 | some brilliant code, pushing it to a server, and having users pay 335 | them lots of money. They'd prefer not to deal with tedious problems 336 | or get involved in messy ways with the real world. Which is a 337 | reasonable preference, because such things slow you down. But this 338 | preference is so widespread that the space of convenient startup 339 | ideas has been stripped pretty clean. If you let your mind wander 340 | a few blocks down the street to the messy, tedious ideas, you'll 341 | find valuable ones just sitting there waiting to be implemented.The schlep filter is so dangerous that I wrote a separate essay 342 | about the condition it induces, which I called 343 | schlep blindness. 344 | I gave Stripe as an example of a startup that benefited from turning 345 | off this filter, and a pretty striking example it is. Thousands 346 | of programmers were in a position to see this idea; thousands of 347 | programmers knew how painful it was to process payments before 348 | Stripe. But when they looked for startup ideas they didn't see 349 | this one, because unconsciously they shrank from having to deal 350 | with payments. And dealing with payments is a schlep for Stripe, 351 | but not an intolerable one. 
In fact they might have had net less 352 | pain; because the fear of dealing with payments kept most people 353 | away from this idea, Stripe has had comparatively smooth sailing 354 | in other areas that are sometimes painful, like user acquisition. 355 | They didn't have to try very hard to make themselves heard by users, 356 | because users were desperately waiting for what they were building.The unsexy filter is similar to the schlep filter, except it keeps 357 | you from working on problems you despise rather than ones you fear. 358 | We overcame this one to work on Viaweb. There were interesting 359 | things about the architecture of our software, but we weren't 360 | interested in ecommerce per se. We could see the problem was one 361 | that needed to be solved though.Turning off the schlep filter is more important than turning off 362 | the unsexy filter, because the schlep filter is more likely to be 363 | an illusion. And even to the degree it isn't, it's a worse form 364 | of self-indulgence. Starting a successful startup is going to be 365 | fairly laborious no matter what. Even if the product doesn't entail 366 | a lot of schleps, you'll still have plenty dealing with investors, 367 | hiring and firing people, and so on. So if there's some idea you 368 | think would be cool but you're kept away from by fear of the schleps 369 | involved, don't worry: any sufficiently good idea will have as many.The unsexy filter, while still a source of error, is not as entirely 370 | useless as the schlep filter. If you're at the leading edge of a 371 | field that's changing rapidly, your ideas about what's sexy will 372 | be somewhat correlated with what's valuable in practice. Particularly 373 | as you get older and more experienced. Plus if you find an idea 374 | sexy, you'll work on it more enthusiastically. 
375 | [13] 376 | RecipesWhile the best way to discover startup ideas is to become the sort 377 | of person who has them and then build whatever interests you, 378 | sometimes you don't have that luxury. Sometimes you need an idea 379 | now. For example, if you're working on a startup and your initial 380 | idea turns out to be bad.For the rest of this essay I'll talk about tricks for coming up 381 | with startup ideas on demand. Although empirically you're better 382 | off using the organic strategy, you could succeed this way. You 383 | just have to be more disciplined. When you use the organic method, 384 | you don't even notice an idea unless it's evidence that something 385 | is truly missing. But when you make a conscious effort to think 386 | of startup ideas, you have to replace this natural constraint with 387 | self-discipline. You'll see a lot more ideas, most of them bad, 388 | so you need to be able to filter them.One of the biggest dangers of not using the organic method is the 389 | example of the organic method. Organic ideas feel like inspirations. 390 | There are a lot of stories about successful startups that began 391 | when the founders had what seemed a crazy idea but "just knew" it 392 | was promising. When you feel that about an idea you've had while 393 | trying to come up with startup ideas, you're probably mistaken.When searching for ideas, look in areas where you have some expertise. 394 | If you're a database expert, don't build a chat app for teenagers 395 | (unless you're also a teenager). Maybe it's a good idea, but you 396 | can't trust your judgment about that, so ignore it. There have to 397 | be other ideas that involve databases, and whose quality you can 398 | judge. Do you find it hard to come up with good ideas involving 399 | databases? That's because your expertise raises your standards. 
400 | Your ideas about chat apps are just as bad, but you're giving 401 | yourself a Dunning-Kruger pass in that domain.The place to start looking for ideas is things you need. There 402 | must be things you need. 403 | [14]One good trick is to ask yourself whether in your previous job you 404 | ever found yourself saying "Why doesn't someone make x? If someone 405 | made x we'd buy it in a second." If you can think of any x people 406 | said that about, you probably have an idea. You know there's demand, 407 | and people don't say that about things that are impossible to build.More generally, try asking yourself whether there's something unusual 408 | about you that makes your needs different from most other people's. 409 | You're probably not the only one. It's especially good if you're 410 | different in a way people will increasingly be.If you're changing ideas, one unusual thing about you is the idea 411 | you'd previously been working on. Did you discover any needs while 412 | working on it? Several well-known startups began this way. Hotmail 413 | began as something its founders wrote to talk about their previous 414 | startup idea while they were working at their day jobs. 415 | [15]A particularly promising way to be unusual is to be young. Some 416 | of the most valuable new ideas take root first among people in their 417 | teens and early twenties. And while young founders are at a 418 | disadvantage in some respects, they're the only ones who really 419 | understand their peers. It would have been very hard for someone 420 | who wasn't a college student to start Facebook. So if you're a 421 | young founder (under 23 say), are there things you and your friends 422 | would like to do that current technology won't let you?The next best thing to an unmet need of your own is an unmet need 423 | of someone else. Try talking to everyone you can about the gaps 424 | they find in the world. What's missing? What would they like to 425 | do that they can't? 
What's tedious or annoying, particularly in 426 | their work? Let the conversation get general; don't be trying too 427 | hard to find startup ideas. You're just looking for something to 428 | spark a thought. Maybe you'll notice a problem they didn't consciously 429 | realize they had, because you know how to solve it.When you find an unmet need that isn't your own, it may be somewhat 430 | blurry at first. The person who needs something may not know exactly 431 | what they need. In that case I often recommend that founders act 432 | like consultants — that they do what they'd do if they'd been 433 | retained to solve the problems of this one user. People's problems 434 | are similar enough that nearly all the code you write this way will 435 | be reusable, and whatever isn't will be a small price to start out 436 | certain that you've reached the bottom of the well. 437 | [16]One way to ensure you do a good job solving other people's problems 438 | is to make them your own. When Rajat Suri of E la Carte decided 439 | to write software for restaurants, he got a job as a waiter to learn 440 | how restaurants worked. That may seem like taking things to extremes, 441 | but startups are extreme. We love it when founders do such things.In fact, one strategy I recommend to people who need a new idea is 442 | not merely to turn off their schlep and unsexy filters, but to seek 443 | out ideas that are unsexy or involve schleps. Don't try to start 444 | Twitter. Those ideas are so rare that you can't find them by looking 445 | for them. Make something unsexy that people will pay you for.A good trick for bypassing the schlep and to some extent the unsexy 446 | filter is to ask what you wish someone else would build, so that 447 | you could use it. 
What would you pay for right now?Since startups often garbage-collect broken companies and industries, 448 | it can be a good trick to look for those that are dying, or deserve 449 | to, and try to imagine what kind of company would profit from their 450 | demise. For example, journalism is in free fall at the moment. 451 | But there may still be money to be made from something like journalism. 452 | What sort of company might cause people in the future to say "this 453 | replaced journalism" on some axis?But imagine asking that in the future, not now. When one company 454 | or industry replaces another, it usually comes in from the side. 455 | So don't look for a replacement for x; look for something that 456 | people will later say turned out to be a replacement for x. And 457 | be imaginative about the axis along which the replacement occurs. 458 | Traditional journalism, for example, is a way for readers to get 459 | information and to kill time, a way for writers to make money and 460 | to get attention, and a vehicle for several different types of 461 | advertising. It could be replaced on any of these axes (it has 462 | already started to be on most).When startups consume incumbents, they usually start by serving 463 | some small but important market that the big players ignore. It's 464 | particularly good if there's an admixture of disdain in the big 465 | players' attitude, because that often misleads them. For example, 466 | after Steve Wozniak built the computer that became the Apple I, he 467 | felt obliged to give his then-employer Hewlett-Packard the option 468 | to produce it. Fortunately for him, they turned it down, and one 469 | of the reasons they did was that it used a TV for a monitor, which 470 | seemed intolerably déclassé to a high-end hardware company like HP 471 | was at the time. 
472 | [17]Are there groups of 473 | scruffy 474 | but sophisticated users like the early 475 | microcomputer "hobbyists" that are currently being ignored by the 476 | big players? A startup with its sights set on bigger things can 477 | often capture a small market easily by expending an effort that 478 | wouldn't be justified by that market alone.Similarly, since the most successful startups generally ride some 479 | wave bigger than themselves, it could be a good trick to look for 480 | waves and ask how one could benefit from them. The prices of gene 481 | sequencing and 3D printing are both experiencing Moore's Law-like 482 | declines. What new things will we be able to do in the new world 483 | we'll have in a few years? What are we unconsciously ruling out 484 | as impossible that will soon be possible? 485 | OrganicBut talking about looking explicitly for waves makes it clear that 486 | such recipes are plan B for getting startup ideas. Looking for 487 | waves is essentially a way to simulate the organic method. If 488 | you're at the leading edge of some rapidly changing field, you don't 489 | have to look for waves; you are the wave.Finding startup ideas is a subtle business, and that's why most 490 | people who try fail so miserably. It doesn't work well simply to 491 | try to think of startup ideas. If you do that, you get bad ones 492 | that sound dangerously plausible. The best approach is more indirect: 493 | if you have the right sort of background, good startup ideas will 494 | seem obvious to you. But even then, not immediately. It takes 495 | time to come across situations where you notice something missing. 496 | And often these gaps won't seem to be ideas for companies, just 497 | things that would be interesting to build. Which is why it's good 498 | to have the time and the inclination to build things just because 499 | they're interesting.Live in the future and build what seems interesting. Strange as 500 | it sounds, that's the real recipe. 
501 | Notes[1] 502 | This form of bad idea has been around as long as the web. It 503 | was common in the 1990s, except then people who had it used to say 504 | they were going to create a portal for x instead of a social network 505 | for x. Structurally the idea is stone soup: you post a sign saying 506 | "this is the place for people interested in x," and all those people 507 | show up and you make money from them. What lures founders into 508 | this sort of idea are statistics about the millions of people who 509 | might be interested in each type of x. What they forget is that 510 | any given person might have 20 affinities by this standard, and no 511 | one is going to visit 20 different communities regularly.[2] 512 | I'm not saying, incidentally, that I know for sure a social 513 | network for pet owners is a bad idea. I know it's a bad idea the 514 | way I know randomly generated DNA would not produce a viable organism. 515 | The set of plausible sounding startup ideas is many times larger 516 | than the set of good ones, and many of the good ones don't even 517 | sound that plausible. So if all you know about a startup idea is 518 | that it sounds plausible, you have to assume it's bad.[3] 519 | More precisely, the users' need has to give them sufficient 520 | activation energy to start using whatever you make, which can vary 521 | a lot. For example, the activation energy for enterprise software 522 | sold through traditional channels is very high, so you'd have to 523 | be a lot better to get users to switch. Whereas the activation 524 | energy required to switch to a new search engine is low. Which in 525 | turn is why search engines are so much better than enterprise 526 | software.[4] 527 | This gets harder as you get older. While the space of ideas 528 | doesn't have dangerous local maxima, the space of careers does. 
529 | There are fairly high walls between most of the paths people take 530 | through life, and the older you get, the higher the walls become.[5] 531 | It was also obvious to us that the web was going to be a big 532 | deal. Few non-programmers grasped that in 1995, but the programmers 533 | had seen what GUIs had done for desktop computers.[6] 534 | Maybe it would work to have this second self keep a journal, 535 | and each night to make a brief entry listing the gaps and anomalies 536 | you'd noticed that day. Not startup ideas, just the raw gaps and 537 | anomalies.[7] 538 | Sam Altman points out that taking time to come up with an 539 | idea is not merely a better strategy in an absolute sense, but also 540 | like an undervalued stock in that so few founders do it.There's comparatively little competition for the best ideas, because 541 | few founders are willing to put in the time required to notice them. 542 | Whereas there is a great deal of competition for mediocre ideas, 543 | because when people make up startup ideas, they tend to make up the 544 | same ones.[8] 545 | For the computer hardware and software companies, summer jobs 546 | are the first phase of the recruiting funnel. But if you're good 547 | you can skip the first phase. If you're good you'll have no trouble 548 | getting hired by these companies when you graduate, regardless of 549 | how you spent your summers.[9] 550 | The empirical evidence suggests that if colleges want to help 551 | their students start startups, the best thing they can do is leave 552 | them alone in the right way.[10] 553 | I'm speaking here of IT startups; in biotech things are different.[11] 554 | This is an instance of a more general rule: focus on users, 555 | not competitors. The most important information about competitors 556 | is what you learn via users anyway.[12] 557 | In practice most successful startups have elements of both. 
558 | And you can describe each strategy in terms of the other by adjusting 559 | the boundaries of what you call the market. But it's useful to 560 | consider these two ideas separately.[13] 561 | I almost hesitate to raise that point though. Startups are 562 | businesses; the point of a business is to make money; and with that 563 | additional constraint, you can't expect you'll be able to spend all 564 | your time working on what interests you most.[14] 565 | The need has to be a strong one. You can retroactively 566 | describe any made-up idea as something you need. But do you really 567 | need that recipe site or local event aggregator as much as Drew 568 | Houston needed Dropbox, or Brian Chesky and Joe Gebbia needed Airbnb?Quite often at YC I find myself asking founders "Would you use this 569 | thing yourself, if you hadn't written it?" and you'd be surprised 570 | how often the answer is no.[15] 571 | Paul Buchheit points out that trying to sell something bad 572 | can be a source of better ideas:"The best technique I've found for dealing with YC companies that 573 | have bad ideas is to tell them to go sell the product ASAP (before 574 | wasting time building it). Not only do they learn that nobody 575 | wants what they are building, they very often come back with a 576 | real idea that they discovered in the process of trying to sell 577 | the bad idea."[16] 578 | Here's a recipe that might produce the next Facebook, if 579 | you're college students. If you have a connection to one of the 580 | more powerful sororities at your school, approach the queen bees 581 | thereof and offer to be their personal IT consultants, building 582 | anything they could imagine needing in their social lives that 583 | didn't already exist. 
Anything that got built this way would be 584 | very promising, because such users are not just the most demanding 585 | but also the perfect point to spread from.I have no idea whether this would work.[17] 586 | And the reason it used a TV for a monitor is that Steve Wozniak 587 | started out by solving his own problems. He, like most of his 588 | peers, couldn't afford a monitor.Thanks to Sam Altman, Mike Arrington, Paul Buchheit, John Collison, 589 | Patrick Collison, Garry Tan, and Harj Taggar for reading drafts of 590 | this, and Marc Andreessen, Joe Gebbia, Reid Hoffman, Shel Kaphan, 591 | Mike Moritz and Kevin Systrom for answering my questions about 592 | startup history. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai==0.27.6 2 | tiktoken==0.4.0 3 | -------------------------------------------------------------------------------- /summarization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Optimal Chunk-size for Large Document Summarization\n", 8 | "\n", 9 | "### Large document summarization" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "The document has a size of: 9160\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "from chunker import naive_chunker, auto_chunker,get_token_size\n", 27 | "from utils import get_chunk_summary, get_global_summary\n", 28 | "import textwrap\n", 29 | "\n", 30 | "OPENAI_KEY=\"your key here\"\n", 31 | "MODEL='gpt-3.5-turbo'\n", 32 | "\n", 33 | "with open('./documents/startupideas.txt', 'r') as file:\n", 34 | " test_document=file.read()\n", 35 | "\n", 36 | "print('The document has a size 
of:',get_token_size(test_document, MODEL))" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "#### Biased Global Summarization with Naive chunking method\n", 44 | "We intentionally selected a chunk size of 3000 to highlight the problem more clearly." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "CHUNK_SIZE=3000\n", 54 | "naive_chunks=naive_chunker(test_document, CHUNK_SIZE, MODEL)\n", 55 | "naive_chunk_summaries=[get_chunk_summary(chunk, OPENAI_KEY, MODEL) for chunk in naive_chunks]\n", 56 | "naive_global_summary=get_global_summary(naive_chunk_summaries,OPENAI_KEY, MODEL)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 9, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "chunk size list: [3000, 3000, 3000, 160] \n", 69 | "\n", 70 | "last chunk text: already exist. Anything that got built this way would be\n", 71 | "very promising, because such users are not just the most demanding\n", 72 | "but also the perfect point to spread from.I have no idea whether this would work.[17]\n", 73 | "And the reason it used a TV for a monitor is that Steve Wozniak\n", 74 | "started out by solving his own problems. He, like most of his\n", 75 | "peers, couldn't afford a monitor.Thanks to Sam Altman, Mike Arrington, Paul Buchheit, John Collison,\n", 76 | "Patrick Collison, Garry Tan, and Harj Taggar for reading drafts of\n", 77 | "this, and Marc Andreessen, Joe Gebbia, Reid Hoffman, Shel Kaphan,\n", 78 | "Mike Moritz and Kevin Systrom for answering my questions about\n", 79 | "startup history. 
\n", 80 | "\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "print('chunk size list:',[get_token_size(chunk, MODEL) for chunk in naive_chunks], '\\n')\n", 86 | "print('last chunk text:',naive_chunks[-1], '\\n')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "**The final chunk has a size of 160, which ideally shouldn't contribute any significant information to the global summary.**" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "global summary: The document discusses the importance of understanding user needs and creating\n", 106 | "products that fulfill those needs in order to build successful startups. It\n", 107 | "emphasizes the danger of building products that no one wants and the need to\n", 108 | "focus on a specific group of users who urgently need the product. The chunk also\n", 109 | "mentions the importance of being at the leading edge of a rapidly changing field\n", 110 | "and having a mind that is prepared to notice opportunities for startup ideas. It\n", 111 | "suggests that startup ideas should come from the founders' own experiences and\n", 112 | "that the most successful startups begin organically. The document also discusses\n", 113 | "the process of coming up with startup ideas, emphasizing the importance of\n", 114 | "living in the future and identifying what is missing in the present. It advises\n", 115 | "against focusing too much on entrepreneurship education and instead recommends\n", 116 | "gaining knowledge in different fields to identify problems that software can\n", 117 | "solve. 
The document provides advice and strategies for finding startup ideas,\n", 118 | "suggesting looking for ideas in areas where you have expertise, identifying\n", 119 | "unmet needs of yourself or others, and seeking out waves or trends that could\n", 120 | "lead to new opportunities. It also emphasizes the importance of being open-\n", 121 | "minded, observing gaps and anomalies, and focusing on users rather than\n", 122 | "competitors. The chunk mentions the potential of building something that already\n", 123 | "exists and targeting demanding users who can spread the word. It explains why\n", 124 | "Steve Wozniak used a TV as a monitor when starting out. The document concludes\n", 125 | "by encouraging individuals to question the status quo and look for things that\n", 126 | "are missing in order to find startup ideas.\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "print('global summary:', textwrap.fill(naive_global_summary, 80))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "**However, in the produced global summary, it erroneously emphasizes the content of this final chunk:** \n", 139 | "\n", 140 | ">\"It explains why Steve Wozniak used a TV as a monitor when starting out. The document concludes by encouraging individuals to question the status quo and look for things that are missing in order to find startup ideas.\"" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### Chunking with automatic chunk size determination\n", 148 | "\n", 149 | "We use the same chunk size 3000 as our MAX_CHUNK_SIZE." 
150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 3, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "MAX_CHUNK_SIZE=3000\n", 159 | "auto_chunks=auto_chunker(test_document, MAX_CHUNK_SIZE, MODEL)\n", 160 | "auto_chunk_summaries=[get_chunk_summary(chunk, OPENAI_KEY, MODEL) for chunk in auto_chunks]\n", 161 | "auto_global_summary=get_global_summary(auto_chunk_summaries,OPENAI_KEY, MODEL)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 4, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "chunk size list: [2290, 2290, 2290, 2290] \n", 174 | "\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "print('chunk size list:',[get_token_size(chunk, MODEL) for chunk in auto_chunks],'\\n')" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "**We can see the chunk sizes are more balanced with our method.**" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 5, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "global summary: The author reflects on their experience starting a company and emphasizes the\n", 199 | "importance of paying attention to users' needs. They discuss the danger of\n", 200 | "\"made-up\" startup ideas that sound plausible but do not have a market demand.\n", 201 | "The author suggests that good startup ideas are those that address a specific\n", 202 | "group or type of user and have a path for growth. They also mention that being\n", 203 | "at the leading edge of a field or having experiences that prepare the mind to\n", 204 | "notice opportunities can lead to successful startup ideas. The author concludes\n", 205 | "by stating that preparation and being in the right mindset are key factors in\n", 206 | "generating organic startup ideas. 
The document chunk discusses the process of\n", 207 | "finding startup ideas. It emphasizes the importance of being at the leading edge\n", 208 | "of a rapidly changing field and noticing things that are missing. It suggests\n", 209 | "questioning the status quo and focusing on problems that annoy or challenge you.\n", 210 | "The document also mentions the benefits of working on projects that seem\n", 211 | "interesting or like \"toys,\" as they often lead to good startup ideas. It advises\n", 212 | "college students to focus on ratcheting themselves into the future rather than\n", 213 | "learning about entrepreneurship in a classroom setting. The document also\n", 214 | "mentions the clash of domains as a source of ideas and suggests working in\n", 215 | "unrelated fields to find startup ideas. It warns against focusing too much on\n", 216 | "research and highlights the importance of solving users' problems. Finally, the\n", 217 | "document mentions that feeling late to an idea is a sign of a good idea and\n", 218 | "encourages not to be deterred by competition. The document chunk discusses\n", 219 | "various factors to consider when coming up with startup ideas. It emphasizes\n", 220 | "that competition should not be a major concern, as success depends more on the\n", 221 | "founder's abilities and the urgency of the problem being solved. It also\n", 222 | "suggests turning off filters such as the unsexy filter and the schlep filter to\n", 223 | "explore less conventional ideas. The chunk provides tips for generating startup\n", 224 | "ideas, including looking for gaps in the market, considering personal expertise\n", 225 | "and needs, and talking to others about their unmet needs. The document chunk\n", 226 | "discusses strategies for finding startup ideas. It suggests that founders should\n", 227 | "seek out unsexy or unglamorous ideas that people will pay for. 
It also\n", 228 | "recommends looking for dying industries or companies that can be replaced by a\n", 229 | "new innovative solution. The document emphasizes the importance of serving a\n", 230 | "small but important market that big players ignore and riding on bigger waves of\n", 231 | "technological advancements. It also mentions the need for founders to have the\n", 232 | "right background and to live in the future by building what seems interesting.\n", 233 | "The document concludes by stating that finding startup ideas is a subtle\n", 234 | "business and that it takes time and observation to notice gaps and anomalies\n", 235 | "that can be turned into interesting projects.\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "print('global summary:', textwrap.fill(auto_global_summary, 80))" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "**The global summary has also improved compared to the naive method!**" 248 | ] 249 | } 250 | ], 251 | "metadata": { 252 | "kernelspec": { 253 | "display_name": "hack", 254 | "language": "python", 255 | "name": "python3" 256 | }, 257 | "language_info": { 258 | "codemirror_mode": { 259 | "name": "ipython", 260 | "version": 3 261 | }, 262 | "file_extension": ".py", 263 | "mimetype": "text/x-python", 264 | "name": "python", 265 | "nbconvert_exporter": "python", 266 | "pygments_lexer": "ipython3", 267 | "version": "3.9.16" 268 | }, 269 | "orig_nbformat": 4 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 2 273 | } 274 | -------------------------------------------------------------------------------- /toy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Optimal Chunk-size for Large Document Summarization\n", 8 | "\n", 9 | "### Toy comparison" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 
16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n", 22 | "document_size: 10\n", 23 | "max_chunk_size: 5\n", 24 | "Naive chunking size list: [5, 5]\n", 25 | "Auto chunking size list: [5, 5]\n", 26 | "----------------------------------------\n", 27 | "document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n", 28 | "document_size: 11\n", 29 | "max_chunk_size: 5\n", 30 | "Naive chunking size list: [5, 5, 1]\n", 31 | "Auto chunking size list: [4, 4, 3]\n", 32 | "----------------------------------------\n", 33 | "document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n", 34 | "document_size: 12\n", 35 | "max_chunk_size: 5\n", 36 | "Naive chunking size list: [5, 5, 2]\n", 37 | "Auto chunking size list: [4, 4, 4]\n", 38 | "----------------------------------------\n", 39 | "document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n", 40 | "document_size: 13\n", 41 | "max_chunk_size: 5\n", 42 | "Naive chunking size list: [5, 5, 3]\n", 43 | "Auto chunking size list: [5, 4, 4]\n", 44 | "----------------------------------------\n", 45 | "document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n", 46 | "document_size: 14\n", 47 | "max_chunk_size: 5\n", 48 | "Naive chunking size list: [5, 5, 4]\n", 49 | "Auto chunking size list: [5, 5, 4]\n", 50 | "----------------------------------------\n", 51 | "document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n", 52 | "document_size: 15\n", 53 | "max_chunk_size: 5\n", 54 | "Naive chunking size list: [5, 5, 5]\n", 55 | "Auto chunking size list: [5, 5, 5]\n", 56 | "----------------------------------------\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "from chunker import naive_chunker, auto_chunker, get_token_size\n", 62 | "MODEL='gpt-3.5-turbo'\n", 63 | "\n", 64 | "def comparison(test_doc, chunk_size):\n", 65 | " print('document_size:', get_token_size(test_doc, MODEL))\n", 66 | " 
import time

import openai

# Marker the model is asked to emit before its answer; everything after the
# last occurrence in the reply is treated as the summary text.
delimiter = "####"

# Total number of request attempts (not *extra* retries) per API call.
MAX_ATTEMPTS = 3
MODEL = 'gpt-3.5-turbo'

# Pause between a failed attempt and the next one.
_RETRY_DELAY_SECONDS = 1


def ChatGPT_API(messages, openai_key, model):
    """Send a chat-completion request, retrying when the call raises.

    Args:
        messages: List of chat message dicts (``{"role": ..., "content": ...}``).
        openai_key: API key passed to the request as ``api_key``.
        model: Name of the chat model to query.

    Returns:
        The ``content`` of the first choice's message, or the literal string
        ``"Server Error"`` when every one of ``MAX_ATTEMPTS`` attempts raised.
    """
    for attempt in range(MAX_ATTEMPTS):
        try:
            response = openai.ChatCompletion.create(
                api_key=openai_key,
                model=model,
                messages=messages,
                temperature=0,
            )
        except Exception as error:
            # Deliberately best-effort: any failure of the request is reported
            # and retried; callers receive "Server Error" instead of an
            # exception once the attempts are exhausted.
            print(error)
            # Only wait when another attempt will actually follow.
            if attempt < MAX_ATTEMPTS - 1:
                time.sleep(_RETRY_DELAY_SECONDS)
        else:
            # Kept outside the ``try`` so a malformed response still raises
            # instead of being retried.
            return response["choices"][0]["message"]["content"]

    return "Server Error"


def _ask(system_msg, user_msg, openai_key, model):
    """Send one system/user prompt pair and return the text after the last delimiter."""
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    result = ChatGPT_API(messages, openai_key, model)
    # The prompt asks for "reply format:<delimiter>", so the answer is whatever
    # follows the last delimiter; a reply without one is returned whole.
    return result.split(delimiter)[-1].strip()


def get_chunk_summary(content, openai_key, model):
    """Summarize a single document chunk.

    Args:
        content: Text of the chunk.
        openai_key: API key forwarded to ``ChatGPT_API``.
        model: Name of the chat model to query.

    Returns:
        The stripped summary text. If the request ultimately failed this is
        the string ``"Server Error"`` returned by ``ChatGPT_API``.
    """
    system_msg = f"""
    Summarize this document chunk.
    reply format:{delimiter}"""
    user_msg = 'here is the document chunk:\n' + content
    return _ask(system_msg, user_msg, openai_key, model)


def get_global_summary(list_of_summaries, openai_key, model):
    """Combine per-chunk summaries into one summary of the whole document.

    Args:
        list_of_summaries: Chunk summaries in document order; they are sent to
            the model as the ``str()`` of this object.
        openai_key: API key forwarded to ``ChatGPT_API``.
        model: Name of the chat model to query.

    Returns:
        The stripped global summary text. If the request ultimately failed
        this is the string ``"Server Error"`` returned by ``ChatGPT_API``.
    """
    system_msg = f"""
    You are given a a list of summaries, each summary summarizes a chunk of a document in sequence.
    Combine a list of summaries into one global summary of the document.
    reply format:{delimiter}"""
    user_msg = 'here is the list of the summaries:\n' + str(list_of_summaries)
    return _ask(system_msg, user_msg, openai_key, model)