├── .devcontainer.json ├── .env.example ├── .gitignore ├── README.md ├── data └── transcripts │ ├── ai.txt │ ├── cybersecurity.txt │ ├── lecture0.txt │ ├── lecture1.txt │ ├── lecture2.txt │ ├── lecture3.txt │ ├── lecture4.txt │ ├── lecture5.txt │ ├── lecture6.txt │ ├── lecture7.txt │ ├── lecture8.txt │ └── lecture9.txt ├── examples ├── chat │ ├── anthropic │ │ ├── chat.py │ │ ├── chat0.py │ │ ├── chat1.py │ │ ├── chat2.py │ │ ├── chat3.py │ │ └── chat4.py │ ├── google │ │ ├── chat.py │ │ ├── chat0_1.py │ │ ├── chat0_2.py │ │ ├── chat1_1.py │ │ ├── chat1_2.py │ │ ├── chat2_1.py │ │ ├── chat2_2.py │ │ ├── chat3_1.py │ │ ├── chat3_2.py │ │ └── chat4.py │ └── openai │ │ ├── chat.py │ │ ├── chat0.py │ │ ├── chat1.py │ │ ├── chat2.py │ │ ├── chat3.py │ │ └── chat4.py ├── embeddings │ └── openai │ │ ├── 00_embedding_example.py │ │ ├── 01_create_embeddings.py │ │ ├── 02_search_embeddings.py │ │ ├── 03_qa_with_embeddings_based_search.py │ │ └── create_embedding.sh └── rag │ └── openai │ ├── assistant.py │ └── assistant_stream.py ├── requirements.txt └── utils └── clean_up.py /.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "customizations": { 3 | "vscode": { 4 | "extensions": [ 5 | "mathematic.vscode-pdf", 6 | "ms-azuretools.vscode-docker", 7 | "ms-toolsai.jupyter", 8 | "ms-python.autopep8", 9 | "ms-python.python", 10 | "ms-vscode.cpptools", 11 | "ms-vscode.hexeditor", 12 | "ms-vsliveshare.vsliveshare", 13 | "plibither8.remove-comments" 14 | ], 15 | "settings": { 16 | "explorer.compactFolders": false, 17 | "editor.minimap.enabled": false, 18 | "editor.quickSuggestionsDelay": 3000, 19 | "emmet.showExpandedAbbreviation": "never", 20 | "files.trimFinalNewlines": false, 21 | "files.trimTrailingWhitespace": true, 22 | "workbench.editor.enablePreview": false, 23 | "workbench.editor.closeOnFileDelete": true, 24 | "workbench.preferredLightColorTheme": "GitHub Dark Default" 25 | } 26 | } 27 | } 28 | } 
-------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | CLAUDE_API_KEY=TODO # https://console.anthropic.com/settings/keys 2 | OPENAI_API_KEY=TODO # https://platform.openai.com/api-keys (note that API key is now project-based) 3 | GEMINI_API_KEY=TODO # https://aistudio.google.com/app/apikey 4 | 5 | # After you've added your API keys to this file, rename .env.example to .env -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.env 2 | *.jsonl 3 | *.vsix 4 | demo/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CS50 Workshop on AI 2 | 3 | This workshop is designed to introduce you to the capabilities of OpenAI's APIs, including Chat Completion, Embedding, and Assistant APIs, with hands-on demonstrations and code examples. 4 | 5 | Slides for this workshop are available [here](https://docs.google.com/presentation/d/11k93gz0mYpSwaB9bvbtofa2o11Pg7Z2_hrH3pB4APQ0/). 6 | 7 | ## Requirements 8 | 9 | - Python 3.x 10 | - OpenAI Python Library (installation guide below) 11 | - OpenAI API Key 12 | - Internet Connection 13 | 14 | ## Installation 15 | 16 | Before we dive into the demos, please ensure your environment is set up with the necessary software and libraries: 17 | 18 | ```bash 19 | # Install the OpenAI library and other dependencies 20 | pip3 install -r requirements.txt 21 | ``` 22 | 23 | ## Demo 1: Chat Completion API 24 | 25 | This demo illustrates how to utilize the Chat Completion API to create an interactive chatbot. 26 | 27 | ### Key Features 28 | 29 | - **System Message**: Sets the context for the AI (e.g., "You are a friendly and supportive teaching assistant for CS50. 
You are also a cat.") 30 | - **User Interaction**: Accepts user input to simulate a conversation. 31 | - **API Integration**: Utilizes the `chat.completions.create` method to generate responses based on the conversation history. 32 | - **Streaming Responses**: Demonstrates how to handle long-running completions with streaming. 33 | 34 | ## Demo 2: Text Embeddings and Semantic Search 35 | 36 | This demo showcases the use of OpenAI's text embeddings to perform semantic search, enabling the identification of the most relevant information chunk in response to a user query. This technique can significantly enhance the way educational content is queried and retrieved, making it a powerful tool for educators and students alike. 37 | 38 | ### Key Features of Demo 2 39 | 40 | - **Text Embeddings**: Illustrates how to generate and utilize text embeddings using OpenAI's `embeddings.create` method. 41 | - **Semantic Search**: Demonstrates how to compute similarity scores between embeddings to find the most relevant content. 42 | - **Integration with Chat API**: Combines the result of semantic search with the Chat Completion API to generate contextually relevant responses. 43 | 44 | ### Usage Notes 45 | 46 | - **Pre-computed Embeddings**: Before running this demo, ensure you have an `embeddings.jsonl` file containing pre-computed embeddings for various content chunks relevant to your subject matter. 47 | - **Custom Model Selection**: You can experiment with different models for embeddings to suit your content and accuracy requirements. 48 | 49 | ## Demo 3: Assistant API with Custom Data and Context 50 | 51 | This demo showcases how to create an assistant (with a vector store attached) that can utilize specific data files to provide tailored responses. It is particularly useful for creating specialized assistants for events, courses, or research projects. 
52 | 53 | ### Key Features 54 | 55 | - **Custom Assistant Creation**: Guides you through creating an assistant tailored to the needs of answering CS50 or computer science-related questions. 56 | - **Data File Utilization**: Demonstrates how to upload and associate data files with your assistant to enrich its responses. 57 | - **Dynamic Interaction**: Engages users in a conversational interface, utilizing the assistant to respond to queries based on the provided data and instructions. 58 | 59 | ### Usage Notes 60 | 61 | - **Data Preparation**: Before running the demo, ensure your `FILES_DIR` points to the directory containing relevant files you wish to use with your assistant. We have pre-configured the use of lecture transcripts in the example. 62 | - **Customization**: You can customize the assistant's name, behavior, and capabilities to fit various educational or research contexts. 63 | -------------------------------------------------------------------------------- /data/transcripts/ai.txt: -------------------------------------------------------------------------------- 1 | [MUSIC PLAYING] TOM CRUISE: I'm going to show you some magic. It's the real thing. 2 | 3 | [LAUGHTER] 4 | 5 | I mean, it's all the real thing. 6 | 7 | [LAUGHTER] 8 | 9 | DAVID J. MALAN: All right. This is CS50, Harvard University's Introduction to the Intellectual Enterprises of Computer Science and the Art of Programming. My name is David Malan, and this is our family-friendly introduction to artificial intelligence or AI, which seems to be everywhere these days. 10 | 11 | But first, a word on these rubber ducks, which your students might have had for some time. 
Within the world of computer science, and programming in particular, there's this notion of rubber duck debugging or rubber ducking-- 12 | 13 | --whereby in the absence of a colleague, a friend, a family member, a teaching fellow who might be able to answer your questions about your code, especially when it's not working, ideally you might have at least a rubber duck or really any inanimate object on your desk with whom to talk. 14 | 15 | And the idea is, that in expressing your logic, talking through your problems, even though the duck doesn't actually respond, invariably, you hear eventually the illogic in your thoughts and the proverbial light bulb goes off. 16 | 17 | Now, for students online for some time, CS50 has had a digital version thereof, whereby in the programming environment that CS50 students use, for the past several years, if they don't have a rubber duck on their desk, they can pull up this interface here. 18 | 19 | And if they begin a conversation like, I'm hoping you can help me solve some problem, up until recently, CS50's virtual rubber duck would simply quack once, twice, or three times in total. But we have anecdotal evidence that alone was enough to get students to realize what it is they were doing wrong. 20 | 21 | But of course, more recently has this duck and so many other ducks, so to speak, around the world, come to life really. And your students have been using artificial intelligence in some form within CS50 as a virtual teaching assistant. 22 | 23 | And what we'll do today, is reveal not only how we've been using and leveraging AI within CS50, but also how AI itself works, and to prepare you better for the years ahead. So last year around this time, like DALL-E 2 and image generation were all of the rage. 24 | 25 | You might have played with this, whereby you can type in some keywords and boom, you have a dynamically generated image. Similar tools are like Midjourney, which gives you even more realistic 3D imagery. 
And within that world of image generation, there were nonetheless some tells, like an observant viewer could tell that this was probably generated by AI. 26 | 27 | And in fact, a few months ago, The New York Times took a look at some of these tools. And so, for instance, here is a sequence of images that at least at left, isn't all that implausible that this might be an actual photograph. But in fact, all three of these are AI-generated. 28 | 29 | And for some time, there was a certain tell. Like AI up until recently, really wasn't really good at the finer details, like the fingers are not quite right. And so you could have that sort of hint. But I dare say, AI is getting even better and better, such that it's getting harder to discern these kinds of things. 30 | 31 | So if you haven't already, go ahead and take out your phone if you have one with you. And if you'd like to partake, scan this barcode here, which will lead you to a URL. And on your screen, you'll have an opportunity in a moment to buzz in. If my colleague, Rongxin, wouldn't mind joining me up here on stage. 32 | 33 | We'll ask you a sequence of questions and see just how prepared you are for this coming world of AI. So for instance, once you've got this here, code scanned, if you don't, that's fine. You can play along at home or alongside the person next to you. 34 | 35 | Here are two images. And my question for you is, which of these two images, left or right, was generated by AI? Which of these two was generated by AI, left or right? And I think Rongxin, we can flip over and see as the responses start to come in. 36 | 37 | So far, we're about 20% saying left, 70 plus percent saying right. 3%, 4%, comfortably admitting unsure, and that's fine. Let's wait for a few more responses to come in, though I think the right-hand folks have it. And let's go ahead and flip back and see what the solution is. 38 | 39 | In this case, it was, in fact, the right-hand side that was AI-generated. So, that's great. 
I'm not sure what it means that we figured this one out, but let's try one more here. So let me propose that we consider now these two images. 40 | 41 | It's the same code. So if you still have your phone up, you don't need to scan again. It's going to be the same URL here. But just in case you closed it. Let's take a look now at these two images. Which of these, left or right, was AI-generated? Left or right this time? 42 | 43 | Rongxin, should we take a look at how it's coming in? Oh, it's a little closer this time. Left or right? Right's losing a little ground, maybe as people are changing their answers to left. More people are unsure this time, which is somewhat revealing. Let's give folks another second or two. 44 | 45 | And Rongxin, should we flip back? The answer is, actually a trick question, since they were both AI. So most of you, most of you were, in fact, right. But if you take a glance at this, is getting really, really good. 46 | 47 | And so this is just a taste of the images that we might see down the line. And in fact, that video with which we began, Tom Cruise, as you might have gleaned, was not, in fact, Tom Cruise. 48 | 49 | That was an example of a deepfake, a video that was synthesized, whereby a different human was acting out those motions, saying those words, but software, artificial intelligence-inspired software was mutating the actual image and faking this video. 50 | 51 | So it's all fun and games for now as we tinker with these kinds of examples, but suffice it to say, as we've begun to discuss in classes like this already, disinformation is only going to become more challenging in a world where it's not just text, but it's imagery. And all the more, soon video. 52 | 53 | But for today, we'll focus really on the fundamentals, what it is that's enabling technologies like these, and even more familiarly, text generation, which is all the rage. 
And in fact, it seems just a few months ago, probably everyone in this room started to hear about tools like ChatGPT. 54 | 55 | So we thought we'd do one final exercise here as a group. And this was another piece in The New York Times where they asked the audience, "Did a fourth grader write this? Or the new chatbot?" So another opportunity to assess your discerning skills. 56 | 57 | So same URL. So if you still have your phone open and that same interface open, you're in the right place. And here, we'll take a final stab at two essays of sorts. Which of these essays was written by AI? Essay 1 or Essay 2? 58 | 59 | And as folks buzz in, I'll read the first. Essay 1. I like to bring a yummy sandwich and a cold juice box for lunch. Sometimes I'll even pack a tasty piece of fruit or a bag of crunchy chips. As we eat, we chat and laugh and catch up on each other's day, dot, dot, dot. 60 | 61 | Essay 2. My mother packs me a sandwich, a drink, fruit, and a treat. When I get in the lunchroom, I find an empty table and sit there and I eat my lunch. My friends come and sit down with me. Dot, dot, dot. Rongxin, should we see what folks think? 62 | 63 | It looks like most of you think that Essay 1 was generated by AI. And in fact, if we flip back to the answer here, it was, in fact, Essay 1. So it's great that we now already have seemingly this discerning eye, but let me perhaps deflate that enthusiasm by saying it's only going to get harder to discern one from the other. And we're really now on the bleeding edge of what's soon to be possible. 64 | 65 | But most everyone in this room has probably by now seen, tried, certainly heard of ChatGPT, which is all about textual generation. Within CS50 and within academia more generally, have we been thinking about, talking about, how whether to use or not use these kinds of technologies. 
66 | 67 | And if the students in the room haven't told the family members in the room already, this here is an excerpt from CS50's own syllabus this year, whereby we have deemed tools like ChatGPT in their current form, just too helpful. Sort of like an overzealous friend who in school, who just wants to give you all of the answers instead of leading you to them. 68 | 69 | And so we simply prohibit by policy using AI-based software, such as ChatGPT, third-party tools like GitHub Copilot, Bing Chat, and others that suggest or complete answers to questions or lines of code. But it would seem reactionary to take away what technology surely has some potential upsides for education. 70 | 71 | And so within CS50 this semester, as well as this past summer, have we allowed students to use CS50's own AI-based software, which are in effect, as we'll discuss, built on top of these third-party tools, ChatGPT from OpenAI, companies like Microsoft and beyond. 72 | 73 | And in fact, what students can now use, is this brought to life CS50 duck, or DDB, Duck Debugger, within a website of our own, CS50 AI, and another that your students know, known as cs50.dev. 74 | 75 | So students are using it, but in a way where we have tempered the enthusiasm of what might otherwise be an overly helpful duck to model it more akin to a good teacher, a good teaching fellow, who might guide you to the answers, but not simply hand them outright. 76 | 77 | So what does that actually mean, and in what form does this duck come? Well, architecturally, for those of you with engineering backgrounds that might be curious as to how this is actually implemented, if a student here in the class has a question, virtually in this case, they somehow ask these questions of this central web application, cs50.ai. 
78 | 79 | But we, in turn, have built much of our own logic on top of third-party services, known as APIs, application programming interfaces, features that other companies provide that people like us can use. So as they are doing really a lot of the heavy lifting, the so-called large language models are there. 80 | 81 | But we, too, have information that is not in these models yet. For instance, the words that came out of my mouth just last week when we had a lecture on some other topic, not to mention all of the past lectures and homework assignments from this year. 82 | 83 | So we have our own vector database locally via which we can search for more recent information, and then hand some of that information into these models, which you might recall, at least for OpenAI, is cut off as of 2021 as of now, to make the information even more current. 84 | 85 | So architecturally, that's sort of the flow. But for now, I thought I'd share at a higher level what it is your students are already familiar with, and what will soon be more broadly available to our own students online as well. 86 | 87 | So what we focused on is, what's generally now known as prompt engineering, which isn't really a technical phrase, because it's not so much engineering in the traditional sense. It really is just English, what we are largely doing when it comes to giving the AI the personality of a good teacher or a good duck. 88 | 89 | So what we're doing, is giving it what's known as a system prompt nowadays, whereby we write some English sentences, send those English sentences to OpenAI or Microsoft, that sort of teaches it how to behave. Not just using its own knowledge out of the box, but coercing it to behave a little more educationally constructively. 90 | 91 | And so for instance, a representative snippet of English that we provide to these services looks a little something like this. Quote, unquote, "You are a friendly and supportive teaching assistant for CS50. 
You are also a rubber duck. You answer student questions only about CS50 and the field of computer science, do not answer questions about unrelated topics. Do not provide full answers to problem sets, as this would violate academic honesty." 92 | 93 | And so in essence, and you can do this manually with ChatGPT, you can tell it or ask it how to behave. We, essentially, are doing this automatically, so that it doesn't just hand answers out of the box and knows a little something more about us. 94 | 95 | There's also in this world of AI right now the notion of a user prompt versus that system prompt. And the user prompt, in our case, is essentially the student's own question. I have a question about x, or I have a problem with my code here in y, so we pass to those same APIs, students' own questions as part of this so-called user prompt. Just so you're familiar now with some of the vernacular of late. 96 | 97 | Now, the programming environment that students have been using this whole year is known as Visual Studio Code, a popular open source, free product, that most-- so many engineers around the world now use. But we've instrumented it to be a little more course-specific with some course-specific features that make learning within this environment all the easier. 98 | 99 | It lives at cs50.dev. And as students in this room know, that as of now, the virtual duck lives within this environment and can do things like explain highlighted lines of code. So here, for instance, is a screenshot of this programming environment. Here is some arcane looking code in a language called C, that we've just left behind us in the class. 100 | 101 | And suppose that you don't understand what one or more of these lines of code do. 
Students can now highlight those lines, right-click or Control click on it, select explain highlighted code, and voila, they see a ChatGPT-like explanation of that very code within a second or so, that no human has typed out, but that's been dynamically generated based on this code. 102 | 103 | Other things that the duck can now do for students is advise students on how to improve their code style, the aesthetics, the formatting thereof. And so for instance, here is similar code in a language called C. And I'll stipulate that it's very messy. Everything is left-aligned instead of nicely indented, so it looks a little more structured. 104 | 105 | Students can now click a button. They'll see at the right-hand side in green how their code should ideally look. And if they're not quite sure what those changes are or why, they can click on, explain changes. And similarly, the duck advises them on how and why to turn their not great code into greater code, from left to right respectively. 106 | 107 | More compellingly and more generalizable beyond CS50 and beyond computer science, is AI's ability to answer most of the questions that students might now ask online. And we've been doing asynchronous Q&A for years via various mobile or web applications and the like. 108 | 109 | But to date, it has been humans, myself included, responding to all of those questions. Now the duck has an opportunity to chime in, generally within three seconds, because we've integrated it into an online Q&A tool that students in CS50 and elsewhere across Harvard have long used. 110 | 111 | So here's an anonymized screenshot of a question from an actual student, but written here as John Harvard, who asked this summer, in the summer version of CS50, what is flask exactly? So fairly definitional question. And here is what the duck spit out, thanks to that architecture I described before. 
112 | 113 | I'll stipulate that this is correct, but it is mostly a definition, akin to what Google or Bing could already give you last year. But here's a more nuanced question, for instance, from another anonymized student. In this question here, the student's including an error message that they're seeing. They're asking about that. 114 | 115 | And they're asking a little more broadly and qualitatively, is there a more efficient way to write this code, a question that really is best answered based on experience. Here, I'll stipulate that the duck responded with this answer, which is actually pretty darn good. 116 | 117 | Not only responding in English, but with some sample starter code that would make sense in this context. And at the bottom it's worth noting, because none of this technology is perfect just yet, it's still indeed very bleeding edge, and so what we have chosen to do within CS50 is include disclaimers, like this. 118 | 119 | I am an experimental bot, quack. Do not assume that my reply is accurate unless you see that it's been endorsed by humans, quack. And in fact, at top right, the mechanism we've been using in this tool is usually within minutes. 120 | 121 | A human, whether it's a teaching fellow, a course assistant, or myself, will click on a button like this to signal to our human students that yes, like the duck is spot on here, or we have an opportunity, as always, to chime in with our own responses. 122 | 123 | Frankly, that disclaimer, that button, will soon I do think go away, as the software gets better and better. But for now, that's how we're modulating exactly what students' expectations might be when it comes to correctness or incorrectness. 124 | 125 | It's common too in programming, to see a lot of error messages, certainly when you're learning first-hand. A lot of these error messages are arcane, confusing, certainly to students, versus the people who wrote them. 126 | 127 | Soon students will see a box like this. 
Whenever one of their terminal window programs errs, they'll be assisted too with English-like, TF-like support when it comes to explaining what it is that went wrong with that command. 128 | 129 | And ultimately, what this is really doing for students in our own experience already, is providing them really with virtual office hours and 24/7, which is actually quite compelling in a university environment, where students' schedules are already tightly packed, be it with academics, extracurriculars, athletics, and the like-- 130 | 131 | --and they might have enough time to dive into a homework assignment, maybe eight hours even, for something sizable. But if they hit that wall a couple of hours in, yeah, they can go to office hours or they can ask a question asynchronously online, but it's really not optimal in-the-moment support that we can now provide all the more effectively we hope, through software, as well. 132 | 133 | So if you're curious. Even if you're not a technophile yourself, anyone on the internet can go to cs50.ai and experiment with this user interface. This one here actually resembles ChatGPT itself, but it's specific to CS50. 134 | 135 | And here again is just a sequence of screenshots that I'll stipulate for today's purposes, are pretty darn good and akin to what I myself or a teaching fellow would reply to and answer to a student's question, in this case, about their particular code. 136 | 137 | And ultimately, it's really aspirational. The goal here ultimately is to really approximate a one-to-one teacher to student ratio, which despite all of the resources we within CS50, we within Harvard and places like Yale have, we certainly have never had enough resources to approximate what might really be ideal, which is more of an apprenticeship model, a mentorship, whereby it's just you and that teacher working one-to-one. 
138 | 139 | Now we still have humans, and the goal is not to reduce that human support, but to focus it all the more consciously on the students who would benefit most from some in-person one-to-one support versus students who would happily take it at any hour of the day more digitally via online. 140 | 141 | And in fact, we're still in the process of evaluating just how well or not well all of this works. But based on our summer experiment alone with about 70 students a few months back, one student wrote us at term's end it-- 142 | 143 | --"felt like having a personal tutor. I love how AI bots will answer questions without ego and without judgment. Generally entertaining even the stupidest of questions without treating them like they're stupid. It has an, as one could expect," ironically, "an inhuman level of patience." And so I thought that's telling as to how even one student is perceiving these new possibilities. 144 | 145 | So let's consider now more academically what it is that's enabling those kinds of tools, not just within CS50, within computer science, but really, the world more generally. What the whole world's been talking about is generative artificial intelligence. AI that can generate images, generate text, and sort of mimic the behavior of what we think of as human. 146 | 147 | So what does that really mean? Well, let's start really at the beginning. Artificial intelligence is actually a technique, a technology, a subject that's actually been with us for some time, but it really was the introduction of this very user-friendly interface known as ChatGPT. 148 | 149 | And some of the more recent academic work over really just the past five or six years, that really allowed us to take a massive leap forward it would seem technologically, as to what these things can now do. 150 | 151 | So what is artificial intelligence? It's been with us for some time, and it's honestly, so omnipresent, that we take it for granted nowadays. 
Gmail, Outlook, have gotten really good at spam detection. If you haven't checked your spam folder in a while, that's testament to just how good they seem to be at getting it out of your inbox. 152 | 153 | Handwriting recognition has been with us for some time. I dare say, it, too, is only getting better and better the more the software is able to adapt to different handwriting styles, such as this. 154 | 155 | Recommendation histories and the like, whether you're using Netflix or any other service, have gotten better and better at recommending things you might like based on things you have liked, and maybe based on things other people who like the same thing as you might have liked. 156 | 157 | And suffice it to say, there's no one at Netflix akin to the old VHS stores of yesteryear, who are recommending to you specifically what movie you might like. And there's no code, no algorithm that says, if they like x, then recommend y, else recommend z, because there's just too many movies, too many people, too many different tastes in the world. 158 | 159 | So AI is increasingly sort of looking for patterns that might not even be obvious to us humans, and dynamically figuring out what might be good for me, for you or you, or anyone else. Siri, Google Assistant, Alexa, any of these voice recognition tools that are answering questions. That, too, suffice it to say, is all powered by AI. 160 | 161 | But let's start with something a little simpler than any of those applications. And this is one of the first arcade games from yesteryear known as Pong. And it's sort of like table tennis. And the person on the left can move their paddle up and down. Person on the right can do the same. 162 | 163 | And the goal is to get the ball past the other person, or conversely, make sure it hits your paddle and bounces back. 
Well, somewhat simpler than this insofar as it can be one player, is another Atari game from yesteryear known as Breakout, whereby you're essentially just trying to bang the ball against the bricks to get more and more points and get rid of all of those bricks. 164 | 165 | But all of us in this room probably have a human instinct for how to win this game, or at least how to play this game. For instance, if the ball pictured here back in the '80s as a single red dot just left the paddle, pictured here as a red line, where is the ball presumably going to go next? And in turn, which direction should I slide my paddle? To the left or to the right? 166 | 167 | So presumably, to the left. And we all have an eye for what seemed to be the digital physics of that. And indeed, that would then be an algorithm, sort of step by step instructions for solving some problem. So how can we now translate that human intuition to what we describe more as artificial intelligence? 168 | 169 | Not nearly as sophisticated as those other applications, but we'll indeed, start with some basics. You might know from economics or strategic thinking or computer science, this idea of a decision tree that allows you to decide, should I go this way or this way when it comes to making a decision. 170 | 171 | So let's consider how we could draw a picture to represent even something simplistic like Breakout. Well, if the ball is left of the paddle, is a question or a Boolean expression I might ask myself in code. If yes, then I should move my paddle left, as most everyone just said. 172 | 173 | Else, if the ball is not left of paddle, what do I want to do? Well, I want to ask a question. I don't want to just instinctively go right. I want to check, is the ball to the right of the paddle, and if yes, well, then yes, go ahead and move the paddle right. 174 | 175 | But there is a third situation, which is-- 176 | 177 | AUDIENCE: [INAUDIBLE] 178 | 179 | DAVID J. MALAN: Right. 
Like, don't move, it's coming right at you. So that would be the third scenario here. No, it's not to the right or to the left, so just don't move the paddle. You got lucky, and it's coming, for instance, straight down. 180 | 181 | So Breakout is fairly straightforward when it comes to an algorithm. And we can actually translate this as any CS50 student now could, to code or pseudocode, sort of English-like code that's independent of Java, C, C++ and all of the programming languages of today. 182 | 183 | So in English pseudocode, while a game is ongoing, if the ball is left of paddle, I should move paddle left. Else if ball is right of the paddle, it should say paddle, that's a bug, not intended today, move paddle right. Else, don't move the paddle. 184 | 185 | So that, too, represents a translation of this intuition to code that's very deterministic. You can anticipate all possible scenarios captured in code. And frankly, this should be the most boring game of Breakout, because the paddle should just perfectly play this game, assuming there's no variables or randomness when it comes to speed or angles or the like, which real world games certainly try to introduce. 186 | 187 | But let's consider another game from yesteryear that you might play with your kids today or you did yourself growing up. Here's tic-tac-toe. And for those unfamiliar, the goal is to get three O's in a row or three X's in a row, vertically, horizontally, or diagonally. 188 | 189 | So suppose it's now X's turn. If you've played tic-tac-toe, most of you probably just have an immediate instinct as to where X should probably go, so that it doesn't lose instantaneously. But let's consider in the more general case, how do you solve tic-tac-toe. 190 | 191 | Frankly, if you're in the habit of losing tic-tac-toe, but you're not trying to lose tic-tac-toe, you're actually playing it wrong. Like, you should minimally be able to always force a tie in tic-tac-toe. 
And better yet, you should be able to beat the other person. So hopefully, everyone now will soon walk away with this strategy. 192 | 193 | So how can we borrow inspiration from those same decision trees and do something similar here? So if you, the player, ask yourself, can I get three in a row on this turn? Well, if yes, then you should do that and play the X in that position. Play in the square to get three in a row. Straightforward. 194 | 195 | If you can't get three in a row in this turn, you should ask another question. Can my opponent get three in a row in their next turn? Because then you better preempt that by moving into that position. Play in the square to block opponent's three in a row. 196 | 197 | What if though, that's not the case, right? What if there aren't even that many X's and O's on the board? If you're in the habit of just kind of playing randomly, like you might not be playing optimally as a good AI could. 198 | 199 | So if no, it's kind of a question mark. In fact, there's probably more to this tree, because we could think through, what if I go there. Wait a minute, what if I go there or there or there? You can start to think a few steps ahead as a computer could do much better even than us humans. 200 | 201 | So suppose, for instance, it's O's turn. Now those of you who are very good at tic-tac-toe might have an instinct for where to go. But this is an even harder problem, it would seem. I could go in eight possible places if I'm O. But let's try to break that down more algorithmically, as an AI would. 202 | 203 | And let's recognize, too, that with games in particular, one of the reasons that AI was so early adopted in these games, playing the CPU, is that games really lend themselves to defining them, if taking the fun out of it mathematically. Defining them in terms of inputs and outputs, maybe paddle moving left or right, ball moving up or down. 204 | 205 | You can really quantize it at a very boring low level.
But that lends itself then to solving it optimally. And in fact, with most games, the goal is to maximize or maybe minimize some math function, right? Most games, if you have scores, the goal is to maximize your score, and indeed, get a high score. So games lend themselves to a nice translation to mathematics, and in turn here, AI solutions. 206 | 207 | So one of the first algorithms one might learn in a class on algorithms and on artificial intelligence is something called minimax, which alludes to this idea of trying to minimize and/or maximize something as your function, your goal. 208 | 209 | And it actually derives its inspiration from these same decision trees that we've been talking about. But first, a definition. Here are three representative tic-tac-toe boards. Here is one in which O has clearly won, per the green. Here is one in which X has clearly won, per the green. And this one in the middle just represents a draw. 210 | 211 | Now, there's a bunch of other ways that tic-tac-toe could end, but here's just three representative ones. But let's make tic-tac-toe even more boring than it might have always struck you as. Let's propose that this kind of configuration should have a score of negative 1. If O wins, it's a negative 1. If X wins, it's a positive 1. And if no one wins, we'll call it a 0. 212 | 213 | We need some way of talking about and reasoning about which of these outcomes is better than the other. And what's simpler than 0, 1 and negative 1? So the goal though, of X, it would seem, is to maximize its score, but the goal of O is to minimize its score. So X is really trying to get positive 1, O is really trying to get negative 1. 214 | 215 | And no one really wants 0, but that's better than losing to the other person. So we have now a way to define what it means to win or lose. Well, now we can employ a strategy here. Here, just as a quick check, what would the score be of this board? Just so everyone's on the same page. 216 | 217 | AUDIENCE: 1. 
218 | 219 | DAVID J. MALAN: Or, so 1, because X has won and we just stipulated arbitrarily, this means that this board has a value of 1. Now let's put it into a more interesting context. Here, a game has been played for a few moves already. There's two spots left. No one has won just yet. And suppose that it's O's turn now. 220 | 221 | Now, everyone probably has an instinct already as to where to go, but let's try to break this down more algorithmically. So what is the value of this board? Well, we don't know yet, because no one has won, so let's consider what could happen next. 222 | 223 | So we can draw this actually as a tree, as before. Here, for instance, is what might happen if O goes into the top left-hand corner. And here's what might happen if O goes into the bottom middle spot instead. We should ask ourselves, what's the value of this board, what's the value of this board? 224 | 225 | Because if O's purpose in life is to minimize its score, it's going to go left or right based on whichever yields the smallest number. Negative 1, ideally. But we're still not sure yet, because we don't have definitions for boards with holes in them like this. 226 | 227 | So what could happen next here? Well, it's obviously going to be X's turn next. So if X moves, unfortunately, X has won in this configuration. We can now conclude that the value of this board is what number? 228 | 229 | AUDIENCE: 1. 230 | 231 | DAVID J. MALAN: So 1. And because there's only one way to reach this board, by transitivity, you might as well think of the value of this previous board as also 1, because no matter what, it's going to lead to that same outcome. 232 | 233 | And so the value of this board is actually still to be determined, because we don't know if O is going to want to go with the 1, and probably not, because that means X wins. But let's see what the value of this board is. 234 | 235 | Well, suppose that indeed, X goes in that top left corner here.
What's the value of this board here? 0, because no one has won. There's no X's or O's three in a row. So the value of this board is 0. There's only one way logically to get there, so we might as well think of the value of this board as also 0. 236 | 237 | And so now, what's the value of this board? Well, if we started the story by thinking about O's turn, O's purpose is the min in minimax, then which move is O going to make? Go to the left or go to the right? 238 | 239 | O is probably going to go to the right and make the move that leads to, whoops, that leads to this board, because even though O can't win in this configuration, at least X didn't win. So it's minimized its score relatively, even though it's not a clean win. 240 | 241 | Now, this is all fine and good for a configuration of the board that's like almost done. There's only two moves left. The game's about to end. But if you kind of expand in your mind's eye, how did we get to this branch of the decision tree, if we rewind one step where there's three possible moves, frankly, the decision tree is a lot bigger. 242 | 243 | If we rewind further in your mind's eye and have four moves left or five moves or all nine moves left, imagine just zooming out, out, and out. This is becoming a massive, massive tree of decisions. Now, even so, here is that same subtree, the same decision tree we just looked at. 244 | 245 | This is the exact same thing, but I shrunk the font so that it appears here on the screen here. But over here, we have what could happen if instead, it's actually X's turn, because we're one move prior. There's a bunch of different moves X could now make, too. 246 | 247 | So what is the implication of this? Well, most humans are not thinking through tic-tac-toe to this extreme. And frankly, most of us probably just don't have the mental capacity to think about going left and then right and then left and then right. Right? This is not how people play tic-tac-toe.
Like, we're not using that much memory, so to speak. 248 | 249 | But a computer can handle that, and computers can play tic-tac-toe optimally. So if you're beating a computer at tic-tac-toe, like, it's not implemented very well. It's not following this very logical, deterministic minimax algorithm. 250 | 251 | But this is where now AI is no longer as simple as just doing what these decision trees say. In the context of tic-tac-toe, here's how we might translate this to code, for instance. If player is X, for each possible move, calculate a score for the board, as we were doing verbally, and then choose the move with the highest score. Because X's goal is to maximize its score. 252 | 253 | If the player is O, though, for each possible move, calculate a score for the board, and then choose the move with the lowest score. So that's a distillation of that verbal walkthrough into what CS50 students know now as code, or at least pseudocode. 254 | 255 | But the problem with games, not so much tic-tac-toe, but other more sophisticated games is this. Does anyone want to ballpark how many possible ways there are to play tic-tac-toe? Paper, pencil, two human children, how many different ways? How long could you keep them occupied playing tic-tac-toe in different ways? 256 | 257 | If you actually think through, how big does this tree get, how many leaves are there on this decision tree, like how many different directions, well, if you're thinking 255,168, you are correct. And now most of us in our lifetime have probably not played tic-tac-toe that many times. 258 | 259 | So think about how many games you've been missing out on. There are different decisions you could have been making all these years. Now, that's a big number, but honestly, that's not a big number for a computer. 260 | 261 | That's a few megabytes of memory maybe, to keep all of that in mind and implement that kind of code in C or Java or C++ or something else. But other games are much more complicated. 
And the games that you and I might play as we get older, they include maybe chess. 262 | 263 | And if you think about chess with only the first four moves, back and forth four times, so only four moves. That's not even a very long game. Anyone want to ballpark how many different ways there are to begin a game of chess with four moves back and forth? 264 | 265 | This is evidence as to why chess is apparently so hard. 288 million ways, which is why when you are really good at chess, you are really good at chess. Because apparently, you either have an intuition for or a mind for thinking it would seem so many more steps ahead than your opponent. 266 | 267 | And don't get us started on something like Go. 266 quintillion ways to play Go's first four moves. So at this point, we just can't pull out our Mac, our PC, certainly not our phone, to solve optimally games like chess and Go, because we don't have big enough CPUs. We don't have enough memory. We don't have enough years in our lifetimes for the computers to crunch all of those numbers. 268 | 269 | And thus was born a different form of AI that's more inspired by finding patterns more dynamically, learning from data, as opposed to being told by humans, here is the code via which to solve this problem. So machine learning is a subset of artificial intelligence that tries instead to get machines to learn what they should do without being so coached step by step by step by humans here. 270 | 271 | Reinforcement learning, for instance, is one such example thereof, where, in reinforcement learning, you sort of wait for the computer or maybe a robot to maybe just get better and better and better at things. And as it does, you reward it with a reward function. Give it plus 1 every time it does something well. And maybe minus 1. You punish it any time it does something poorly.
272 | 273 | And if you simply program this AI or this robot to maximize its score, never mind minimizing, maximize its score, ideally, it should repeat behaviors that got it plus 1. It should decrease the frequency with which it does bad behaviors that got it negative 1. And you can reinforce this kind of learning. 274 | 275 | In fact, I have here one demonstration. Could a student come on up who does not think they are particularly coordinated? If-- OK, wow, you're being nominated by your friends. Come on up. Come on up. 276 | 277 | [LAUGHTER] 278 | 279 | Their hands went up instantly for you. OK, what is your name? 280 | 281 | AMAKA: My name's Amaka. 282 | 283 | DAVID J. MALAN: Amaka, do you want to introduce yourself to the world? 284 | 285 | AMAKA: Hi, my name is Amaka. I am a first year in Holworthy. I'm planning to concentrate in CS. 286 | 287 | DAVID J. MALAN: Wonderful. Nice to see you. Come on over here. 288 | 289 | [APPLAUSE] 290 | 291 | So, yes, oh, no, it's sort of like a game show here. We have a pan here with what appears to be something pancake-like. And we'd like to teach you how to flip a pancake, so that when you gesture upward, the pancake should flip around as though you cooked the other side. So we're going to reward you verbally with plus 1 or minus 1. 292 | 293 | Minus 1. Minus 1. OK, plus 1! Plus 1, so do more of that. Minus 1. Minus 1. Minus 1. Do less of that. 294 | 295 | [LAUGHTER] 296 | 297 | AUDIENCE: Great, great. 298 | 299 | DAVID J. MALAN: All right! A big round of applause. 300 | 301 | [APPLAUSE] 302 | 303 | Thank you. We've been in the habit of handing out Super Mario Brothers Oreos this year, so thank you for participating. 304 | 305 | [APPLAUSE] 306 | 307 | So, this is actually a good example of an opportunity for reinforcement learning. And wonderfully, a researcher has posted a video that we thought we'd share. 
It's about a minute and a half long, where you can watch a robot now do exactly what our wonderful human volunteer here just attempted as well. 308 | 309 | So let me go ahead and play this on the screen and give you a sense of what the human and the robot are doing together. So their pancake looks a little similar there. The human here is going to first sort of train the robot what to do by showing it some gestures. But there's no one right way to do this. 310 | 311 | But the human seems to know how to do it pretty well in this case, and so it's trying to give the machine examples of how to flip a pancake successfully. But now, this is the very first trial. OK, look familiar? You're in good company. 312 | 313 | After three trials. 314 | 315 | [CLANG] 316 | 317 | [PLOP] 318 | 319 | OK. 320 | 321 | [CLANG] 322 | 323 | [PLOP] 324 | 325 | OK. Now 10 tries. There's the human picking up the pancake. After 11 trials-- 326 | 327 | [CLANG] 328 | 329 | [PLOP] 330 | 331 | And meanwhile, there's presumably a human coding this, in the sense that someone is saying good job or bad job, plus 1 or minus 1. 20 trials. Here now we'll see how the computer knows what it's even doing. There's just a mapping to some kind of XYZ coordinate system. So the robot can quantize what it is it's doing. Nice! To do more of one thing, less of another. 332 | 333 | And you're just seeing a visualization in the background of those digitized movements. And so now, after 50 some odd trials, the robot, too, has got it spot on. And it should be able to repeat this again and again and again, in order to keep flipping this pancake. 334 | 335 | So our human volunteer wonderfully took you even fewer trials. But this is an example then, to be clear, of what we'd call reinforcement learning, whereby you're reinforcing a behavior you want or negatively reinforcing. That is, punishing a behavior that you don't. 
336 | 337 | Here's another example that brings us back into the realm of games a little bit, but in a very abstract way. If we were playing a game like The Floor Is Lava, where you're only supposed to step certain places so that you don't fall straight in the lava pit or something like that and lose a point or lose a life, each of these squares might represent a position. 338 | 339 | This yellow dot might represent the human player that can go up, down, left or right within this world. I'm revealing to the whole audience where the lava pits are. But the goal for this yellow dot is to get to green. But the yellow dot, as in any good game, does not have this bird's eye view and knows from the get-go exactly where to go. It's going to have to try some trial and error. 340 | 341 | But if we, the programmers, maybe reinforce good behavior or punish bad behavior, we can teach this yellow dot, without giving it step by step, up, down, left, right instructions, what behaviors to repeat and what behaviors not to repeat. 342 | 343 | So, for instance, suppose the robot moves right. Ah, that was bad. You fell in the lava already, so we'll use a bit of computer memory to draw a thicker red line there. Don't do that again. So, negative 1, so to speak. Maybe the yellow dot moves up next time. We can reward that behavior by not drawing any walls and allowing it to go again. 344 | 345 | It's making pretty good progress, but, oh, darn it, it took a right turn and now fell into the lava. But let's use a bit more of the computer's memory and keep track of the, OK, do not do that thing anymore. Maybe the next time the human dot goes this way. Oh, we want to punish that behavior, so we'll remember as much with that red line. 346 | 347 | But now we're starting to make progress until, oh, now we hit this one. 
And eventually, even though the yellow dot, much like our human, much like our pancake flipping robot had to try again and again and again, after enough trials, it's going to start to realize what behaviors it should repeat and which ones it shouldn't. 348 | 349 | And so in this case, maybe it finally makes its way up to the green dot. And just to recap, once it finds that path, now it can remember it forever as with these green thicker lines. Any time you want to leave this map, any time you get really good at the Nintendo game, you follow that same path again and again, so you don't fall into the lava. 350 | 351 | But an astute human observer might realize that, yes, this is correct. It's getting out of this so-called maze. But what is suboptimal or bad about this solution? Sure. 352 | 353 | AUDIENCE: It's taking a really long time. It's not the most efficient way to get there. 354 | 355 | DAVID J. MALAN: Exactly. It's taking a really long time. An inefficient way to get there, because I dare say, if we just tried a different path occasionally, maybe we could get lucky and get to the exit quicker. And maybe that means we get a higher score or we get rewarded even more. 356 | 357 | So within a lot of artificial intelligence algorithms, there's this idea of exploring versus exploiting, whereby you should occasionally, yes, exploit the knowledge you already have. And in fact, frequently exploit that knowledge. 358 | 359 | But occasionally you know what you should probably do, is explore just a little bit. Take a left instead of a right and see if it leads you to the solution even more quickly. And you might find a better and better solution. So here mathematically is how we might think of this. 360 | 361 | 10% of the time we might say that epsilon, just some variable, sort of a sprinkling of salt into the algorithm here, epsilon will be like 10% of the time. So if my robot or my player picks a random number that's less than 10%, that's going to make a random move. 
Go left instead of right, even if you really typically go right. Otherwise, make the move with the highest value, as we've learned over time. 362 | 363 | And what the robot might learn then, is that we could actually go via this path, which gets us to the output faster. We get a higher score, we do it in less time, it's a win-win. Frankly, this really resonates with me, because I've been in the habit, as maybe some of you are, when you go to a restaurant maybe that you really like, you find a dish you really like-- 364 | 365 | --I will never again know what other dishes that restaurant offers, because I'm locally optimally happy with the dish I've chosen. And I will never know if there's an even better dish at that restaurant unless again, I sort of sprinkle a little bit of epsilon, a little bit of randomness into my game playing, my dining out. 366 | 367 | The catch, of course, though, is that I might be punished. I might, therefore, be less happy if I pick something and I don't like it. So there's this tension between exploring and exploiting. But in general in computer science, and especially in AI, adding a little bit of randomness, especially over time, can, in fact, yield better and better outcomes. 368 | 369 | But now there's this notion all the more of deep learning, whereby you're trying to infer, to detect patterns, figure out how to solve problems, even if the AI has never seen those problems before, and even if there's no human there to reinforce behavior positively or negatively. Maybe it's just too complex of a problem for a human to stand alongside the robot and say, good or bad job. 370 | 371 | So with deep learning, they're actually very much related to what you might know as neural networks, inspired by human physiology, whereby inside of our brains and elsewhere in our body, there's lots of these neurons here that can send electrical signals to make movements happen from brain to extremities.
372 | 373 | You might have two of these via which signals can be transmitted over a larger distance. And so computer scientists for some time have drawn inspiration from these neurons to create in software, what we call neural networks. 374 | 375 | Whereby, there's inputs to these networks and there's outputs from these networks that represents inputs to problems and solutions thereto. So let me abstract away the more biological diagrams with just circles that represent nodes, or neurons, in this case. 376 | 377 | This we would call in CS50, the input. This is what we would call the output. But this is a very simplistic, a very simple neural network. This might be more common, whereby the network, the AI takes two inputs to a problem and tries to give you one solution. 378 | 379 | Well, let's make this more real. For instance, suppose that at the-- suppose that just for the sake of discussion, here is like a grid that you might see in math class, with a y-axis and an x-axis, vertically and horizontally respectively. 380 | 381 | Suppose there's a couple of blue and red dots in that world. And suppose that our goal, computationally, is to predict whether a dot is going to be blue or red, based on its position within that coordinate system. And maybe this represents some real world notion. Maybe it's something like rain that we're trying to predict. 382 | 383 | But we're doing it more simply with colors right now. So here's my y-axis, here's my x-axis, and effectively, my neural network you can think of conceptually as this. It's some kind of implementation of software where there's two inputs to the problem. Give me an x, give me a y value. And this neural network will output red or blue as its prediction. 384 | 385 | Well, how does it know whether to predict red or blue, especially if no human has painstakingly written code to say when you see a dot here, conclude that it's red. When you see a dot here, conclude that it's blue. 
How can an AI just learn dynamically to solve problems? 386 | 387 | Well, what might be a reasonable heuristic here? Honestly, this is probably a first approximation that's pretty good. If anything's to the left of that line, let the neural network conclude that it's going to be blue. And if it's to the right of the line, let it conclude that it's going to be red. 388 | 389 | Until such time as there's more training data, more real world data that gets us to rethink our assumptions. So for instance, if there's a third dot there, uh-oh, clearly a straight line is not sufficient. So maybe it's more of a diagonal line that splits the blue from the red world here. 390 | 391 | Meanwhile, here's even more dots. And it's actually getting harder now. Like, this line is still pretty good. Most of the blue is up here. Most of the red is down here. And this is why, if we fast forward to today, you know, AI is often very good, but not perfect at solving problems. 392 | 393 | But what is it we're looking at here, and what is this neural network really trying to figure out? Well, again, at the risk of taking some fun out of red and blue dots, you can think of this neural network as indeed having these neurons, which represent inputs here and outputs here. 394 | 395 | And then what's happening inside of the computer's memory, is that it's trying to figure out what the weight of this arrow or edge should be. What the weight of this arrow or edge should be. And maybe there's another variable there, like plus or minus C that just tweaks the prediction. 396 | 397 | So x and y are literally going to be numbers in this scenario. And the output of this neural network ideally is just true or false. Is it red or blue? So it's sort of a binary state, as we discuss a lot in CS50. So here too, to take the fun out of the pretty picture, it's really just like a high school math function. 
398 | 399 | What the neural network in this example is trying to figure out, is what formula of the form ax plus by plus c is going to be arbitrarily greater than 0? And if so, let's conclude that the dot is red if you get back a positive result. If you don't, let's conclude that the dot is going to be blue instead. 400 | 401 | So really what you're trying to do, is figure out dynamically what numbers do we have to tweak, these parameters inside of the neural network that just give us the answer we want based on all of this data? More generally though, this would be really representative of deep learning. 402 | 403 | It's not as simple as input, input, output. There's actually a lot of these nodes, these neurons. There's a lot of these edges. There's a lot of numbers and math are going on that, frankly, even the computer scientists using these neural networks don't necessarily know what they even mean or represent. 404 | 405 | It just happens to be that when you crunch the numbers with all of these parameters in place, you get the answer that you want, at least most of the time. So that's essentially the intuition behind that. And you can apply it to very real world, if mundane applications. 406 | 407 | Given today's humidity, given today's pressure, yes or no, should there be rainfall? And maybe there is some mathematical function that based on years of training data, we can infer what that prediction should be. Another one. Given this amount of advertising in this month, what should our sales be for that year? Should they be up, or should they be down? Sorry, for that particular month. 408 | 409 | So real world problems map readily when you can break them down into inputs and a binary output often, or some kind of output where you want the thing to figure out based on past data what its prediction should be. 
410 | 411 | So that brings us back to generative artificial intelligence, which isn't just about solving problems, but really generating literally images, texts, even videos, that again, increasingly resemble what we humans might otherwise output ourselves. 412 | 413 | And within the world of generative artificial intelligence, do we have, of course, these same images that we saw before, the same text that we saw before, and more generally, things like ChatGPT, which are really examples of what we now call large language models. 414 | 415 | These sort of massive neural networks that have so many inputs and so many neurons implemented in software, that essentially represent all of the patterns that the software has discovered by being fed massive amounts of input. 416 | 417 | Think of it as like the entire textual content of the internet. Think of it as the entire content of courses like CS50 that may very well be out there on the internet. And even though these AIs, these large language models haven't been told how to behave, they're really inferring from all of these examples, for better or for worse, how to make predictions. 418 | 419 | So here, for instance, from 2017, just a few years back, is a seminal paper from Google that introduced what we now know as a transformer architecture. And this introduced this idea of attention values, whereby they propose that given an English sentence, for instance, or really any human sentence, you try to assign numbers, not unlike our past exercises, to each of the words, each of the inputs that speaks to its relationship with other words. 420 | 421 | So if there's a high relationship between two words in a sentence, they would have high attention values. And if maybe it's a preposition or an article, like the or the like, maybe those attention values are lower. And by encoding the world in that way, do we begin to detect patterns that allow us to predict things like words, that is, generate text. 
422 | 423 | So for instance, up until a few years ago, completing this sentence was actually pretty hard for a lot of AI. So for instance here, Massachusetts is a state in the New England region of the Northeastern United States. It borders on the Atlantic Ocean to the east. 424 | 425 | The state's capital is dot, dot, dot. Now, you should think that this is relatively straightforward. It's like just handing you a softball type question. But historically within the world of AI, this word, state, was so relatively far away from the proper noun that it's actually referring back to, that we just didn't have computational models that took in that holistic picture, that frankly, we humans are much better at. 426 | 427 | If you would ask this question a little more quickly, a little more immediately, you might have gotten a better response. But this is, daresay, why chatbots in the past have been so bad in the form of customer service and the like, because they're not really taking all of the context into account that we humans might be inclined to provide. 428 | 429 | What's going on underneath the hood? Without escalating things too quickly, what an artificial intelligence nowadays, these large language models might do, is break down the user's input, your input into ChatGPT into the individual words. 430 | 431 | We might then encode, we might then take into account the order of those words. Massachusetts is first, is is last. We might further encode each of those words using a standard way. And there's different algorithms for this, but you come up with what are called embeddings. 432 | 433 | That is to say, you can use one of those APIs I talked about earlier, or even software running on your own computers, to come up with a mathematical representation of the word, Massachusetts. And Rongxin kindly did this for us last night. 434 | 435 | This is the 1,536 floating point values that OpenAI uses to represent the word, Massachusetts.
And this is to say, and you should not understand anything you are looking at on the screen, nor do I, but this is now a mathematical representation of the input that can be compared against the mathematical representations of other inputs in order to find proximity semantically. 436 | 437 | Words that somehow have relationships or correlations with each other that helps the AI ultimately predict what should the next word out of its mouth be, so to speak. So in a case like, these values represent-- these lines represent all of those attention values. 438 | 439 | And thicker lines means there's more attention given from one word to another. Thinner lines mean the opposite. And those inputs are ultimately fed into a large neural network, where you have inputs on the left, outputs on the right. 440 | 441 | And in this particular case, the hope is to get out a single word, which is the capital of Boston itself, whereby somehow, the neural network and the humans behind it at OpenAI, Microsoft, Google, or elsewhere, have sort of crunched so many numbers by training these models on so much data, that it figured out what all of those weights are, what the biases are, so as to influence mathematically the output therefrom. 442 | 443 | So that is all underneath the hood of what students now perceive as this adorable rubber duck. But underneath it all is certainly a lot of domain knowledge. And CS50, by nature of being OpenCourseWare for the past many years, CS50 is fortunate to actually be part of the model, as might be any other content that's freely available online. 444 | 445 | And so that certainly helps benefit the answers when it comes to asking CS50 specific questions. That said, it's not perfect. And you might have heard of what are currently called hallucinations, where ChatGPT and similar tools just make stuff up. And it sounds very confident. And you can sometimes call on it, whereby you can say, no, that's not right. 
446 | 447 | And it will playfully apologize and say, oh, I'm sorry. But it made up some statement, because it was probabilistically something that could be said, even if it's just not correct. Now, allow me to propose that this kind of problem is going to get less and less frequent. And so as the models evolve and our techniques evolve, this will be less of an issue. 448 | 449 | But I thought it would be fun to end on a note that a former colleague shared just the other day, which was this old poem by Shel Silverstein, another something from our past childhood perhaps. And this was from 1981, a poem called "Homework Machine," which perhaps foretold where we are now in 2023. 450 | 451 | "The homework machine, oh, the homework machine, most perfect contraption that's ever been seen. Just put in your homework, then drop in a dime, snap on the switch, and in ten seconds time, your homework comes out quick and clean as can be. Here it is, 9 plus 4, and the answer is 3. 3? Oh, me. I guess it's not as perfect as I thought it would be." 452 | 453 | So, quite foretelling, sure. 454 | 455 | [APPLAUSE] 456 | 457 | Quite foretelling, indeed. Though, for all this and more, the family members in the audience are welcome to take CS50 yourself online at cs50.edx.org. For all of today and so much more, allow me to thank Brian, Rongxin, Sophie, Andrew, Patrick, Charlie, CS50's whole team. 458 | 459 | If you are a family member here headed to lunch with CS50's team, please look for Cameron holding a rubber duck above her head. Thank you so much for joining us today. This was CS50. 460 | 461 | [APPLAUSE] 462 | 463 | [MUSIC PLAYING] 464 | 465 | -------------------------------------------------------------------------------- /data/transcripts/cybersecurity.txt: -------------------------------------------------------------------------------- 1 | [MUSIC PLAYING] DAVID MALAN: All right, one last time.
This is CS50, and we realize this has been a bit of a fire hose over the past-- thank you. 2 | 3 | [APPLAUSE] 4 | 5 | Thank you. We realize this has been a bit of a fire hose. Indeed, recall that we began the class in week 0, months ago with this here MIT hack, wherein a fire hose was connected to a fire hydrant, in turn connected to a water fountain. And it really spoke to just how much information we predicted would be sort of flowing at you over the past few months. 6 | 7 | If you are feeling all these weeks later that it never actually got easy, and with pset 1 to pset 2, pset 3 on to pset 9, you never quite felt like you got your footing, realize that it's kind of by design because every time you did get your-- every time you did get your footing, our goal was to ratchet things up a little bit more so that you feel like you're still getting something out of that final week. And indeed, that final week is now behind us. 8 | 9 | All that remains ahead of us is the final project. And what we thought we'd do today is recap a little bit of where we began and where you hopefully now are. Take a look at the world of cybersecurity, because it's a scary place out there, but hopefully you're all the more equipped now with a mental model and vocabulary to evaluate threats in the real world, and as educated people, make decisions, be it in industry, be it in government, be it in your own personal or professional lives. 10 | 11 | And we hope ultimately, too, that you've walked away with a very practical skill, including how to program in C, how to program in Python, how to program in SQL, how to program in JavaScript in the context, for instance, of even more HTML, CSS, and the like. But most importantly, we hope that you've really walked away with an understanding of how to program. Like, you're not going to have CS50 by your side or even the duck by your side forever. 
You're going to have really, that foundation that hopefully you'll walk out of here today having accumulated over the past few months. 12 | 13 | And even though the world's languages are going to change, new technologies are going to exist tomorrow, hopefully, you'll find that a lot of the foundations over the past several months really do stay with you and allow you to bootstrap to a new understanding, even if you never take another CS course again. Ultimately, we claim that this was all about solving problems. And hopefully, we've kind of cleaned up your thinking a little bit, given you more tools in your toolkit to think and evaluate and solve problems more methodically, not only in code, but just algorithmically as well. 14 | 15 | And keep this in mind too. If you're still feeling like, oh, I never really quite got your footing-- my footing, think back to how hard Mario might have felt some three months ago. But what ultimately matters in this course is indeed, not so much where you end up relative to your classmates, but where you end up relative to yourself when you began. So here we are, and consider that there delta. And if you don't believe me, like, literally go back this weekend or sometime soon, try implementing Mario in C. And I do dare say it's going to come a little more readily to you. Even if you need to Google something, ask the duck something, ask ChatGPT something just to remember some stupid syntactic detail, the ideas hopefully are with you now for some time. 16 | 17 | So that there hack is actually fully documented here in MIT. Our friends down the road have a tradition of doing such things every year. One year, one of my favorites was they turned the dome of MIT into a recreation of R2-D2. So there's a rich history of going to great lengths to prank each other, or even us here Harvard folks akin to the Harvard Yale video we took a look at last time.
And this duck has really become a defining characteristic of late of CS50, so much so that last year, at the CS50 Hackathon, we invited the duck along. It posed, as it is here, for photographs with your classmates past. 18 | 19 | And then around like, 4:00 AM, it disappeared, and the duck went missing. And we were about to head off to IHOP. Our friends from Yale, your former classmates, had just kind of packed up and started driving back to New Haven. And I'm ashamed to say our first thought was that Yale took it. And we texted our TA friends on the shuttle buses, 4:30 AM asking, hey, did you take our duck because we kind of need it next week for the CS50 fair? And I'm ashamed to say that we thought so, but it was not in fact, them. 20 | 21 | It was this guy instead, down the road. Because a few hours later after I think, no sleep on much of our part, we got the equivalent of a ransom email. "Hi, David, it's your friend, bbd. I hope you're well and not too worried after I left so abruptly yesterday night after such a successful Hackathon and semester so far. I just needed to unwind a bit and take a trip to new places and fresh air. Don't worry though, I will return safe, sound, healthy, home once I am more relaxed. As of right now, I'm just spending some few days with our tech friends up Massachusetts Avenue. They gave me a hand on moving tonight. For some reason, I could never find my feet, and they've been amazing hosts. I will see you soon and I will miss you and Harvard specially our students. Sincerely yours, CS50 bbd." 22 | 23 | So almost a perfect hack. They didn't quite get the DDB detail quite right. But after this, they proceeded to make a scavenger hunt of sorts of clues here. This here is Hundredville. And so in Hundredville, they handed out flyers to students at MIT, inviting folks to write a Python program to solve a mystery. 24 | 25 | "The CS50 duck has been stolen.
The town of Hundredville has been called on you to solve the mystery of the-- authorities believe that the thief stole the duck and then shortly thereafter took a walk out of town. Your goal is to identify who the thief is, what school the thief escaped to, and who the thief's accomplice is who helped them escape. This took place on December 2, 2022, and took place at the CS50 Hackathon." 26 | 27 | In the days to come, we proceeded to receive a series of ransom postcards as the duck traveled, not only to MIT to Professor John Guttag 6.100B class, which is a rough equivalent of CS50 down the road. Pictured there our CS50 duck with some tape on its torso. But then the duck took, apparently, a ride, either in actuality or with Photoshop, not only there, took a tour of the Charles River in front of Harvard, the Charles in front of Boston. It went all the way over to Yale. 28 | 29 | We then received this postcard from Princeton all the way over from Stanford. Duck took a flight according to this photo here, and then saw a bit of the world as well. So eventually, we received a follow-up email saying, "Hi, David. I intend to arrive for the fair between 8:37 AM and 9:47 AM. It would be easier for my MIT hacker friends to bring me to the right location if there's someone waiting there with a sign that says 'Duck'." 30 | 31 | I'm not sure if we actually stood there with a sign holding duck, but it turns out they came actually earlier in the morning to escape detection altogether. The duck found its home and everyone lived happily ever after. And here the duck is again today. But our props to our friends down the road at MIT for returning the duck safely and for going to such crazy lengths to put us in the annals of MIT's Hacks Gallery. 32 | 33 | In fact, in exchange for this, we sent them a little package. And without telling you what it is, you can read more about this here hack that's now been immortalized on hacks.mit.edu at this URL here. 
So maybe round of applause for our friends down the road for having pulled that off a year ago. 34 | 35 | [APPLAUSE] 36 | 37 | So before we dive into some of today's material, I wanted to give you a sense of what lies ahead as well. So this year's CS50 Hackathon is an annual tradition, whereby students here at Harvard and our friends from Yale who will take buses in the other direction to join us in about a week's time for an epic all-nighter, starting roughly at 7:00 PM ending roughly at 7:00 AM will be punctuated by multiple meals, first meal-- first dinner around 9:00 PM, second dinner around 1:00 AM. And those of you who still have the energy and are still awake around 5:00 AM, we'll hop in a shuttle bus and head down to IHOP, the larger one down the road, not the one in the square, and have a little bit of breakfast together. 38 | 39 | The evening typically begins a little bit like this with a lot of energy, the focus of which is entirely on final projects. The staff will be present, but the intent is not to be 12 hours of office hours. Indeed, the staff will be working on their own projects or psets, final projects, and the like, but to guide you toward and point you in the direction of solutions to new problems you have. And we do think that the duck, and in turn, AI, CS50.ai and other tools you'll now be able to use, including the actual ChatGPT, the actual GitHub Copilot, or other AI tools which are now reasonable to use at this point in the semester as you off board from CS50 and enter the real world. Should be an opportunity for you to take your newfound knowledge of software out for a spin and build something of your very own, something that even maybe the TFs and myself have never dabbled in before, but with all of this now software support by your side. 40 | 41 | This here is our very own CS50 shuttles that will take us then to IHOP. 
And then a week after that is the epic CS50 fair, which will be an opportunity to showcase what it is you'll pull off over the next few weeks to students, faculty, and staff across campus. More details to come, but you'll bring over your laptop or phone to a large space on campus. We'll invite all of your friends, even family if they're around. And the goal will be simply to have chats like this and present your final project to passersby. 42 | 43 | There'll be a bit of an incentive model, whereby anyone who chats you up about their project, you can give a little sticker to. And that will enter them into a raffle for fabulous prizes to grease the wheels of conversations as well. And you'll see faculty from across campus join us as well. 44 | 45 | But ultimately, you walk out of that event with this here CS50 shirt, one like it, so you too, can proudly proclaim that you indeed took CS50. So all that and more to come, resting on finally, those final projects. But how to get there. 46 | 47 | So here is some general advice that's not necessarily going to be applicable to all final projects. But as we exit CS50 and enter the real world, here are some tips on what you might read, what you might download, sort of starting points so that in answer to the FAQ, what now? So for instance, if you would like to begin to experience on your own Mac or PC more of the programming environment that we provided to you, sort of turnkey style in the cloud using cs50.dev, you can actually install command line tools on your own laptop, desktop, or the like. For instance, Apple has their own. Windows has their own. So you can open a terminal window on your own computer and execute many of the same commands that you've been doing in Linux this whole term. 48 | 49 | Learning Git, so Git is version control software. And it's very, very popular in industry. And it's a mechanism for saving multiple versions of your files.
Now, this is something you might be familiar with if still, even using file names in the real world, like on your Mac or PC-- maybe this is resume version 1, resume version 2, resume Monday night version, resume Tuesday, or whatever the case may be. If you're using Google documents, this happens automatically nowadays. But with code, it can happen automatically, but also more methodically using this here tool. And Git is a very popular tool for collaborating with others as well. 50 | 51 | And you've actually been secretly using it underneath the hood for a lot of CS50's tools. But we've abstracted away some of the details. But Brian, via this video and any number of other references, can peel back that abstraction and show you how to use it more manually. 52 | 53 | You don't need to use cs50.dev anymore but you are welcome to. You can instead install VS Code onto your own Mac or PC. If you go to this first URL here, it's a free download. It's actually open source. So you can even poke around and see how it, itself is built. And at CS50's own documentation, we have some tips for making it look like CS50's environment even if longer term, you want to cut the cord entirely. 54 | 55 | What can you now do? Well, many of you for your final projects will typically tackle websites, sort of building on the ideas of problem set 9, CS50 finance and the like, or just generally something dynamic. But if you instead want to host a portfolio, like just your resume, just projects you've worked on and the like, a static website can be hosted for free via various services. A popular one is this URL here, called GitHub Pages. There's another service that offers a free tier called Netlify that can allow you to host your own projects statically for free. 56 | 57 | But when it comes to more dynamic hosting, you have many more options. And these are just some of the most popular.
The first three are some of the biggest cloud providers nowadays, whether it's Amazon or Microsoft Azure or Google services. If you go to this fourth URL here, this is GitHub's education pack; they essentially broker with lots of different companies to give students, specifically, discounts on or free access to a lot of tools. So you might want to sign up for that while you're eligible. 58 | 59 | And then lastly, here are two other popular third-party, but not free services, but that are very commonly used when you want to host actual web applications. So maybe it's Flask, maybe it's something else, but something that involves some input and output. Questions meanwhile-- so there's just lots of communities. If you want to keep an eye on what's happening in tech, these are just some of the popular options. And undoubtedly, if you have some techie friends, they'll have suggestions as well. But you might find some of these destinations of interest. Of course increasingly, will you just ask questions of software itself, AI, whether it's ChatGPT, GitHub Copilot, or the like. 60 | 61 | And then classes, we're clearly a little biased here with what's on the screen. So these aren't college classes per se, but freely available OpenCourseWare courses that CS50's team has put together over time. And in a nutshell as you can infer from the suffix of each of these URLs, if you want to learn more about Python, CS50 has got a free, open online class for that, or SQL, thanks to Carter, web and AI stuff, thanks to Brian, a games class, thanks to Colton, cybersecurity, which will extend where we leave off today. And then if you're more interested, not so much in coding and going more deeply into software, but want to take a step higher level and focus more on intersections of computer science with business or law or technology, those, too, are freely available, if you're looking for something to do over January, the summer, or just to dabble over time.
And there's innumerable other free resources from other folks on the internet as well certainly too. 62 | 63 | All right, so a few invitations and thank yous. So one, after today, after we dive into and out of cybersecurity, please do stay in touch via any of CS50's online communities. As we start to recruit next year's team for teaching fellows, teaching assistants, course assistants, we'll be in touch via email for those opportunities as well. 64 | 65 | And now some thanks for the group before we then dive into here today's topic. So one, allow me to thank our hosts here for giving us access to such a wonderful, privileged space to just hold classes in, the whole team for Memorial Hall. Our thanks too, to ESS, which is the team that makes everything sound so good in spaces like this with music, mics, and the like, our friends, of course, Wesley down the road at Changsho, where we went most every other Friday this semester. If you've never actually been, or if you're hearing this online, please join our friends at Changsho on Mass Ave down the road any time you might like. 66 | 67 | And then especially, CS50's team-- there's quite a few humans operating cameras in the room, both here and way in back, as well as online. My thanks. 68 | 69 | [APPLAUSE] 70 | 71 | Thank you to them for making this look and sound so good. And what you don't see is when I do actually screw up, even if we don't fix it in real time, they very kindly help us go back in time, fix things, so that your successors have hopefully, an even improved version as well. And then as well, CS50's own Sophie Anderson, who is the daughter of one of CS50's teaching fellows who lives all the way over in New Zealand, who has wonderfully brought the CS50 duck to life in this animated form. Thanks to Sophie, this duck is now everywhere, including most recently, on some T-shirts too. 72 | 73 | But of course, we have this massive support structure in the form of the team.
This is some of our past team members, but who wonderfully via Zoom you'll recall in week seven, showed us how TCP/IP works by passing those envelopes up, down, left, and right. I commented at the time, disclaimer, that it actually took us quite a bit of effort to do that. And so I thought I would share as a representative thanks of our whole teaching team, whether it's Carter and Julia and Ozan and Cody and all of CS50's team members in Cambridge and New Haven, I thought I'd give you a look behind the scenes at how things go indeed, behind the scenes that you don't necessarily see. So let me switch over here and hit play. 74 | 75 | [VIDEO PLAYBACK] 76 | 77 | [INAUDIBLE] 78 | 79 | [INAUDIBLE] Buffering. OK. Josh? Nice. Helen? Oh. 80 | 81 | [CHUCKLING] 82 | 83 | [INAUDIBLE] Moni-- no, oh, wait. That was amazing, Josh. Sophie. Amazing. That was perfect. Moni. [LAUGHTER] I think I-- 84 | 85 | [INTERPOSING VOICES] 86 | 87 | - Over to you, [INAUDIBLE]. Guy. That was amazing. Thank you all. 88 | 89 | - So good. 90 | 91 | [END PLAYBACK] 92 | 93 | DAVID MALAN: All right, these outtakes aside, my thanks to the whole teaching team for making this whole class possible. 94 | 95 | [APPLAUSE] 96 | 97 | So cybersecurity, this refers to the process of keeping secure our systems, our data, our accounts, and more. And it's something that's going to be increasingly important, as it already is, just because of the sheer omnipresence of technology on our desks, on our laps, in our pockets, and beyond. So exactly what is it? And how can we, as students of computer science over the past many weeks, think about things a little more methodically, a little more carefully, and maybe even put some numbers to the intuition that I think a lot of you probably have when it comes to deciding, is something secure or is it not? So first of all, what does it mean for something to be secure? How might you as citizens of the world now answer that question? What does it mean to be secure?
98 | 99 | AUDIENCE: Resistant to attack. 100 | 101 | DAVID MALAN: OK, so resistant to attack, I like that formulation. Other thoughts on what it means to be secure? What does it mean? Yeah. 102 | 103 | AUDIENCE: You control who has access to it. 104 | 105 | DAVID MALAN: Yeah, so you control who has access to something. And there's these techniques known as authentication, like logging in, authorization, deciding whether or not that person, once authenticated, should have access to things. And, of course, you and I are very commonly in the habit of using fairly primitive mechanisms still. Although, we'll touch today on some technologies that we'll see all the more of in the weeks and months and years to come. But you and I are pretty much in the habit of relying on passwords for most everything still today. 106 | 107 | And so we thought we'd begin with exactly this topic to consider just how secure or insecure is this mechanism and why and see if we can't evaluate it a little more methodically so that we can make more than intuitive arguments, but quantitative compelling arguments as well. So unfortunately we humans are not so good at choosing passwords. And every year, accounts are hacked into. Maybe yours, maybe your friends, maybe your family members have experienced this already. And this unfortunately happens to so many people online. 108 | 109 | But, fortunately, there are security researchers in the world that take a look at attacks once they have happened, particularly when data from attacks, databases, are posted online or on the so-called dark web or the like and downloaded by others for malicious purposes, they can also conversely provide us with some insights as to the behavior of us humans that might give us some insights as to when and why things are getting attacked successfully. So as of last year, here, for instance, according to one measure are the top 10 most popular, a.k.a. 
worst passwords-- at least according to the data that security researchers have been able to glean-- by attacks that have already happened. 110 | 111 | So the number one password as of last year, according to systems compromised, was 123456. The second most, admin. The third most, 12345678. And thereafter, 123456789, 1234, 12345, password, 123, Aa123456, and then 1234567890. So you can actually infer-- sort of goofy as some of these are-- you can actually infer certain policies from these, right? The fact that we're taking such little effort to choose our password seems to correlate really with probably, what's the minimum length of a password required for systems? And you can see that at worst, some systems require only three-digit passwords. And maybe they might require six or eight or nine or even 10. But you can kind of infer corporate policies from these passwords alone. 112 | 113 | If you keep going through the list, there's some funnier ones even down the list that are nonetheless enlightening. So, for instance, lower on the list is Iloveyou, no spaces. Sort of adorable, maybe it's meaningful to you. But if you can think of it, so can an adversary, so can some hacker, so much so that it's this popular on these lists. Qwertyuiop, it's not quite English, but it's derivative of English keyboards. Anyone? Yeah, so this is, if you look at a US English keyboard, it's just the top row of keys if you just hit them all together left to right to choose your, therefore, password. 114 | 115 | And then this one, "password," which has an at sign for the A and a zero for the O, which I guess I'm guessing some of you do similar tricks. But this is the thing too, if you think like you're being clever, well, there's a lot of other adversaries, there's a lot of adversaries out there who are just as good at being clever.
So even heuristics like this that in the past, to be fair, you might have been taught to do because it confuses adversaries' or hackers' attempts, unfortunately, if you know to do it, so does the adversary. And so your accounts aren't necessarily any more secure as a result. 116 | 117 | So what are some of our takeaways from this? Well, one, if you have these lists of passwords, all too possible are, for instance, dictionary attacks. Like we literally have published on the internet-- and there's a citation in the slides if you're curious-- of these most popular passwords in the world. So what's a smart adversary going to do when trying to get into your account? 118 | 119 | They're not necessarily going to try all possible passwords or try your birthday or things like that. They're just going to start with this top 10 list, this top 100 list. And odds are, statistically, in a room this big, they're probably going to get into at least one person's account. But let's consider maybe a little more academically what we can do about this. And let's start with something simple like the simplest, the most omnipresent device we might all have now is some kind of mobile device like a phone. Generally speaking, Apple and Google and others are requiring of us that we at least have a passcode or at least you're prompted to set it up even if you therefore opt out of it. But most of us probably have a passcode, be it numeric or alphabetic or something else. 120 | 121 | So what might we take away from that? Well, suppose that you do the bare minimum. And the default for years has generally been having at least four digits in your passcode. Well, what does that mean? Well, how secure is that? How quickly might it be hacked? 122 | 123 | And, in fact, Carter, would you mind joining me up here? Perhaps we can actually decide together how best to proceed here. 
If you want to flip over to your other screen there, we're going to ask everyone to go to-- I'll pull it up here-- this URL here if you haven't already. And this is going to pull up a polling website that's going to allow you in a moment to answer some multiple choice questions. This is the same URL as earlier if you already logged in. 124 | 125 | And in just a moment, we're going to ask you a question. And I think, can we show the question before we do this? Here's the first question from Carter here. How long might it take to crack-- that is, figure out-- a four-digit passcode on someone's phone, for instance? How long might it take to crack a four-digit passcode? 126 | 127 | Why don't we go ahead and flip over to see who is typing in what. And we'll see what the scores are already. All right, and it looks like most of you think a few seconds. Some of you think a few minutes, a few hours, a few days. So I'd say most of you are about to be very unpleasantly surprised. In fact, the winner here is indeed going to be a few seconds, but perhaps even faster than that. 128 | 129 | So, in fact, let me go ahead and do this. Thank you to Carter. Let me flip over and let me introduce you to, unfortunately, what's a very real world problem known as a brute force attack. As the word kind of conjures, if you think to-- back to yesteryear when there was some kind of battering ram trying to brute force their way into a castle door, it just meant trying to hammer the heck out of a system. A castle, in that case, to get into the destination. 130 | 131 | Digitally though, this might mean being a little more clever. We all know how to write code in a bunch of different languages now. You could maybe open up a text editor, write a Python program to try all possible four-digit codes from 0000 to 9999 in order to figure out exactly, how long does it actually take? So let's first consider this. Let me ask the next question. How many four-digit passcodes are there? 
132 | 133 | Carter, if you wouldn't mind joining me and maybe just staying up with me here to run our second question at this same URL. How many four-digit passcodes are there in the world? On your phone or laptop, you should now see the second question. 134 | 135 | And the answers include 4, 40, 9,999, 10,000, or it's OK to be unsure. Let's go ahead and flip over to the results. And it looks like most of you think 10,000. And, indeed, that is the case. Because if I kind of led you with 0000 to 9999, that's 10,000 possibilities. So that is, in fact, a lot. 136 | 137 | But most of you thought it'd take maybe a few seconds to actually brute force your way into that. Let's consider how we might measure how long that actually takes. So thank you. So in the world of a four-digit passcode-- and they are, indeed, digits, decimal digits from 0 to 9-- another way to think about it is there's 10 possibilities for the first digit, 10 for the next, 10 to the 10. So that really gives us 10 times itself four times or 10,000 in total. But how long does that actually take? 138 | 139 | Well, let me go ahead and do this. I'm going to go ahead and open up on my Mac here, not even-- not even Codespaces or cs50.dev today. I'm going to open up VS Code itself. So before class, I went ahead and installed VS Code on my own Mac here. It looks almost the same as Codespaces, though the windows might look a little different and the menus as well. 140 | 141 | And I've gone ahead here and begun a file called crack.py. To crack something means to break into it, to figure out in this case what the passcode actually is. Well, how might I write some code to try all 10,000 possible passcodes? And, heck, even though this isn't quite going to be like hacking into my actual phone, I bet I could find a USB or a lightning cable, connect the two devices, and maybe send all of these passcodes to my device trying to brute force my way in. 
And that's indeed how a hacker might go about doing this if the manufacturer doesn't protect against that. 142 | 143 | So here's some code. Let me go ahead and do this. From string, import digits. This isn't strictly necessary. But in Python, there is a string library from which you can get all of the decimal digits just so I don't have to manually type out 0 through 9. But that's just a minor optimization. 144 | 145 | But there's another library called itertools, tools related to iteration, doing things in like a looping fashion, where I can import a cross product function, a function that's going to allow me to combine like all numbers with all numbers again and again and again for the length of the passcode. Now I can do a simple Python for loop like this. For each passcode in the cross product of those 10 digits repeated four times. In other words, this is just a programmatic Pythonic way to implement the idea of combining all 10 digits with itself four times in a loop in this fashion. 146 | 147 | And just so we can visualize this, let's just go ahead and print out the passcode. But if I did have a lightning cable or a USB cable, I wouldn't print it. I would maybe send it through the cable to the device to try to get through the passcode screen. So we can revisit now the question of how long might it take to get into this device. Well, let's just try this. 148 | 149 | Python of crack.py. And assume, again, it's connected via cable. So we'll see how long this program takes to run and break into this here phone. Done. So that's all it took for 10,000 iterations. And this is on a Mac that's not even the fastest one out there. You could imagine doing this even faster. So that's actually not necessarily all the best for our security. 150 | 151 | So what could we do instead of 10 digits? Well, most of you have probably upgraded a lot of your passwords to maybe being alphabetical instead. 
So what if I instead were to ask the question-- and Carter, if you want to rejoin me here in a second-- what if I instead were to consider maybe four-letter passcodes? So now we have A through Z four times. And maybe we'll throw into the mix uppercase and-- well, let's just keep it four letters. Let's just go ahead and do maybe uppercase and lowercase, so 52 possibilities. 152 | 153 | This is going to give us 52 times 52 times 52 times 52. And anyone want to ballpark the math here, how many possible four-letter passcodes are there, roughly? 7 million, yeah, so roughly 7 million, which is way bigger than 10,000. So, oh, I spoiled this, didn't I? Can you flip over? 154 | 155 | So how many four-letter passcodes are there? It seems that most of you, 93% of you, in fact, got the answer right. Those of you who are changing your answer-- there we go, no, definitely not that. So, anyhow, I screwed up. Order of operations matters in computing and, indeed, including lectures. So 7 million, so the segue I wanted to make is, OK, how long does that actually take to implement in code? 156 | 157 | Well, let me just tweak our code here a little bit. Let me go ahead and go back into the VS Code on my Mac in which I had the same code as before. So let me shrink my terminal window, go back to the code from which I began. And let's just actually make a simple change. Let me go ahead and simply change digits to something called ASCII letters. 158 | 159 | And this too is just a time saving technique. So I don't have to type out A through Z and uppercase and lowercase like 52 total times. And so I'm going to change digits to ASCII letters. And we'll get a quantitative sense of how long this takes. So Python of crack.py, here's how long it takes to go through 7 million possibilities. 160 | 161 | All right, clearly slower because we haven't seen the end of the list yet. And you can see we're going through all of the lowercase letters here. We're about to hit Z. 
But now we're going through the uppercase letters. So it looks like the answer this time is going to be a few seconds, indeed. But definitely less than a minute would seem, at least on this particular computer. 162 | 163 | So odds are if I'm the adversary and I've plugged this phone into someone's device-- maybe I'm not here in a lecture, but in Starbucks or an airport or anywhere where I have physical opportunity to grab that device and plug a cable in-- it's not going to take long to hack into that device either. So what might be better than just digits and letters from the real world? So add in some punctuation, which like almost every website requires that we do. 164 | 165 | Well, if we want to add punctuation into the mix, if I can get this segue correct so that we can now ask Carter one last time, how many four-character passcodes are possible where a character is an uppercase or lowercase letter or a decimal digit or a punctuation symbol? If you go to your device now, you'll see-- if we want to flip over to the screen-- these possibilities. There's a million, maybe, a billion, a trillion, a quadrillion, or a quintillion when it comes to a-- oh, wrong question. Wow, we're new here, OK. 166 | 167 | OK, we're going to escalate things here. How many eight-character passcodes are possible? We're going to make things more secure, even though I said four. We're now making it more secure to eight. All right, you want to flip over to the chart? 168 | 169 | All right, so it looks like most of you are now erring on the side of quintillion or quadrillion. 1% of you still said million, even though there's definitely more than there were a moment ago. But that's OK. So quadrillion-- quintillion is still winning. And I think if we go and reveal this, with the math, you should be doing is 94 to the 4th power. Because there's 26 plus 26 plus 10 plus some more digits, some punctuation digits in there as well. So it's actually, oh, this is the other example, isn't it? 
This is embarrassing. 170 | 171 | All right, we had a good run in the past nine weeks instead. All right, so if you were curious as to how many four-character passwords are possible, it's 78 million. But that's not the question at hand. The question at hand was, how many eight character passcodes are there? And in this case, the math you would be doing is 94 to the 8th power, which is a really big number. And, in fact, it's this number here, which is roughly 6 quadrillion possibilities. 172 | 173 | Now, I could go about actually doing this in code here. So let me actually, for a final flourish, let me open up VS Code one last time here. And in VS Code, I'm going to go ahead and shrink my terminal window, go back into the code, and I'm going to import not just ASCII letters, not just digits, but punctuation as well, which is going to give me like 32 punctuation symbols from a typical US English keyboard. And I'm going to go ahead and just concatenate them all together in one big list by using the plus operator in Python to plus in both digits and punctuation. And I'm going to change the 4 to an 8. 174 | 175 | So this now, it's what four actual lines of code is, all it takes for an adversary to whip up some code, find a cable as step two, and hack into a phone that even has eight-character passcodes. Let me enlarge in my terminal window here, run for a final time Python of crack.py. And this I'll actually leave running for some time. Because you can get already sort of a palpable feel of how much slower it is-- because these characters clearly haven't moved-- how long it's going to take. 176 | 177 | We might actually do-- need to do a bit more math. Because doing just four-digit passcodes was super fast. Doing four-letter passcodes was slower, but still under a minute. We'll see maybe in time how long this actually runs for. 178 | 179 | But this clearly seems to be better, at least for some definition of better. 
But it should hopefully not be that easy to hack into a system. What does your own device probably do to defend against that brute force attack? Yeah. 180 | 181 | AUDIENCE: Gives you a limited number of tries. 182 | 183 | DAVID MALAN: Yeah, so it gives you a limited number of tries. So odds are, at least once in your life, you've somehow locked yourself out of a device, typically after typing your passcode more than 10 times or 10 attempts or maybe it's your siblings or your roommate's phone that you realize this is a feature of iPhones and Android devices as well. But here's a screenshot of what an iPhone might do if you do try to input the wrong passcode maybe 10 or so times. 184 | 185 | Notice that it's really telling you to try again in one minute. So this isn't fundamentally changing what the adversary can do. The adversary can absolutely use those same four lines of code with a cable and try to hack into your device. 186 | 187 | But what has this just done? It's significantly increased the cost to the adversary, where the cost might be measured in sheer number amount of time-- like minutes, seconds, hours, days, or beyond. Maybe it's increased the cost in the sense of risk. Why? Because if this were like a movie incarnation of this and the adversary has just plugged into the phone and is kind of creepily looking around until you come back, it's going to take way too long for them to safely get away with that, assuming your passcode is not 123456, it's somewhere in the middle of that massive search space. So this just kind of fundamentally raises the bar to the adversary. 188 | 189 | And that's one of the biggest takeaways of cybersecurity in general. It's completely naive to think in terms of absolute security or to even say a sentence like "my website is secure" or even "my home is physically secure." Why? 
Well, for a couple of reasons, like, one, an adversary with enough time, energy, motivation, or resources can surely get into most any system and can surely get into most any home. 190 | 191 | But the other thing to consider, unfortunately, is that if we're the good people in this story and the adversaries are the bad people, you and I rather have to be perfect. In the physical world, we have to lock every door, every window. Because if we mess up just one spot, the adversary can get in. And so there's sort of this imbalance. 192 | 193 | The adversary just has to find the window that's ajar to get into your physical home. The adversary just needs to find one user who's got a really bad password to somehow get into that system. And so cybersecurity is hard. And so what we'll see today really are techniques that can let you create a gauntlet of defenses-- so not just one, but maybe two, maybe three. And even if the adversary gets in, another tenet of cybersecurity is at least, let's have mechanisms in place that detect the adversary, some kind of monitoring, automatic emails. 194 | 195 | You can increasingly see this already in the real world. If you log into your Instagram account from a different city or state suddenly because maybe you're traveling, you will-- if you've opted into settings like these-- often get a notification or an email saying, hey, you seem to have logged in from Palo Alto rather than Cambridge. Is this, in fact, you? So even though we might not be able to keep the adversary out, let's at least minimize the window of opportunity or damage by letting humans like us know that something's been compromised. 196 | 197 | Of course, there is a downside here. And this is another theme of cybersecurity. Every time you improve something, you've got to pay a price. There's going to be a tradeoff. And we've seen this with time and space and money and other such resources when it comes to designing systems already.
What's the downside of this mechanism? Why is this perhaps a bad thing or what's the downside to you, the good person in the story? Yeah. 198 | 199 | AUDIENCE: [INAUDIBLE] 200 | 201 | DAVID MALAN: Yeah, if you've just forgotten your passcode, it's going to be more difficult for you to log in. Or maybe you just really need to get into your phone now and you don't really want to wait a minute. And if you, worse, if you keep trying, sometimes it'll change to two minutes, five minutes, one hour. It'll increase exponentially. Why? Because Apple and Google figure that, they don't necessarily know what the right cutoff is. Maybe it's 10, maybe it's fewer, maybe it's more. 202 | 203 | But at some point, it is much more likely that this is a hacker trying to get in than it is you forgetting your passcode. But in the corporate world, it can be even worse. There's a feature that lets phones essentially self-destruct whereby rather than just making you wait a minute, it will wipe the device, more dramatically. The presumption being that, no, no, no, no, no, if this is a corporate phone, let's lock it down further so that if it is an adversary, the data is gone after 10 failed attempts. 204 | 205 | But there's other mechanisms as well. In addition to logging into phones via passcodes, there's also websites like Gmail, for instance. And it's very common, therefore, to log in to websites like these. And odds are, statistically, a lot of you are in the habit of reusing passwords. Like, no, don't nod if you are. We have cameras everywhere. 206 | 207 | But maybe you're in the habit of reusing it. Why? Because it's hard to remember really big long cryptic passwords. So mathematically, there's surely an advantage there. Why? Because it just makes it so much harder, more time-consuming, more risky for an adversary to get in.
But the other tradeoff is like, my God, I just can't even remember most of my passwords as a result unless I reuse the one good password I thought of and memorized already or maybe I write it down on a post-it note on my monitor, as all too often happens in corporate workplaces. 208 | 209 | Or maybe you're being clever and in your top right drawer, you've got a printout of all of your accounts. Well, if you do, like ha-ha, so do a lot of other people. Or maybe it's a little more secure than that, but there are sociological side effects of these technological policies that really until recent years were maybe underappreciated. The academics, the IT administrators were mandating policies that you and I as human users were not necessarily behaving properly in the face of. 210 | 211 | So nowadays, there are things called password managers. And a password manager is just a piece of software on Macs, on PCs, on phones that manage your passwords for you. What this means specifically is when you go to a website for the very first time, you, the human, don't need to choose your password anymore. You instead click a button or use some keyboard shortcut. And the software generates a really long cryptic password for you that's not even eight characters. It might be 16 or 32 characters, can be even bigger than that, but with lots of randomness. Definitely not going to be on that top 10 or that top 100 list. 212 | 213 | The software thereafter remembers that password for you and even your username, whether it's your email address or something else. And it saves it onto your Mac or your phone or your PC's disk or hard drive. The next time you visit that same website, what you can do is via menu or, better yet, a keyboard shortcut, log into the website without even remembering or even knowing your password. I mean, to this day, I'll tell you, I don't even know anymore 99% of my own passwords. Rather, I rely on software like this to do the heavy lifting for me. 
But there's an obvious downside here, which might be what if you're doing this? Yeah. 214 | 215 | AUDIENCE: [INAUDIBLE] 216 | 217 | DAVID MALAN: Right, so what if they find out the one password that's protecting this software? Because unstated by me up until now is that this password manager itself has a primary password that protects all of those other eggs in the one basket, so to speak. And my one primary password for my own password manager, it is really long and hard to guess. And the odds that anyone's going to guess are just so low that I'm comfortable with that being the one really difficult thing that I've committed to my memory. 218 | 219 | But the problem is if someone does figure it out nonetheless somehow or, worse, I forget what it is. Now, I've not lost access to one account, but all of my accounts. Now, that might be too high of a price to pay. But, again, if you're in the habit of choosing easy passwords like being on that top 10 list, reusing passwords, it's probably a net positive to incur this single risk versus the many risks you're incurring across the board with all of these other sites. 220 | 221 | As for what you can use, increasingly our operating systems come with support for this, be it in the Apple world, Google, Microsoft world, or the like. There's third party software you can pay for and download. But even then, I would beware. And I would ask friends whose opinion you trust or do some googling for reviews and the like. All too often in the software world have password managers been determined to be buggy themselves. 222 | 223 | I mean, you've seen in weeks of CS50 how easy it is to introduce bugs. And even the best of programmers still introduce bugs to software. So you're also trusting that the companies making this password management software is really good at it. And that's not always the case. So beware there too. 
But we'll also focus today on some of the fundamentals that these companies can be using to better protect your data as well. 224 | 225 | But there's another mechanism, which odds are you're in the habit of using. Two-factor authentication, like most of us probably have to use this for some of your accounts-- your Harvard account, your Yale account, maybe your bank accounts, or the like. So what is two-factor authentication in a nutshell? Yeah. 226 | 227 | AUDIENCE: [INAUDIBLE] 228 | 229 | DAVID MALAN: Yeah, you get a second factor that you have to provide to the website or application to prove that it's you like a text to your phone or maybe it's an actual application that gets push notifications or the like. Maybe in the corporate world, it's actually a tiny little device with a screen on it that's on your keychain or the like. Maybe it's actually a USB dongle that you have to plug into your work laptop. In short, it's some second factor. 230 | 231 | And by factor, I mean something technical. It's not just a second password, which would be one factor. It's a second fundamentally different factor. So generally speaking in the world of two-factor authentication or 2FA or MFA is the generalization as multi-factor authentication, you have not just a password, which is something you know, the second factor is usually something you have-- whether it's your phone or that application or the keychain. It might also be biometrics like your fingerprints, your retinas, or something else physically about you. But it's something that significantly decreases the probability that some adversary is going to get into that account. 232 | 233 | Why? Because right now, if you've only got a username and password, your adversaries are literally every human in the world with an internet connection, arguably. 
But as soon as you introduce 2FA, now it's only people on campus or, more narrowly, only the people in Starbucks at that moment who might physically have access to your person and your second factor, in this case. More technically, what those technologies do is they send you a one-time passcode, which is further secure because once it's used, there's hopefully some database that remembers that it has been used and cannot be used again. So an adversary can't like sniff the airwaves and replay that passcode the next time they, indeed, expire, which adds some additional defense. And you might type it into a phone or maybe a web app that looks a little something like this. 234 | 235 | So passwords thus far, some defenses, therefore, any questions on this here mechanism? No? All right, well, let's consider this. Odds are, with some frequency, you forget these passwords, especially if you're not using a password manager. And so you go to Gmail and you actually have to click a link like this, Forgot Password. And then it typically emails you to initiate a process of resetting that password. 236 | 237 | But if you can recall, has anyone ever clicked a link like that and then got an email with your password in the email? Maybe if you ever see this in the wild, that is to say in the real world, that is horrible, horrible design. Why? Because well-designed websites, not unlike CS50 Finance, which had a users table, should not be storing username-- rather, should not be storing passwords in the clear, as it actually is. It should somehow be obfuscated so that even if your database from CS50 Finance or Google's database is hacked and compromised and sold on the web, it should not be as simple as doing like select star from Account semicolon to see what your actual passwords are. 238 | 239 | And the mechanism that well-designed websites use is actually a primitive back from like week 5 when we talked about hashing and hash tables. 
This time, we're using it for slightly different purposes. So in the world of passwords, on the server side, there's often a database or maybe, more simply, a text file somewhere on the server that just associates usernames with passwords. So to keep things simple, if there's at least two users like Alice and Bob, Alice's password is maybe apple. Bob's password is maybe banana, just to keep the mnemonics kind of simple. 240 | 241 | If though that were the case on the server and that server is compromised, whoever the hacker is now has access to every username and every password, which in and of itself might not be a huge deal because maybe the server administrators can just disable all of the accounts, make everyone change their password, and move on. But there's also this attack known as password stuffing, which is a weirdly technical term, which means when you compromise one database, you know what? Take advantage of the naivety of a lot of us users. Try the compromised apple password, the banana password not on the compromised website, but on other websites that you and I might have access to, the presumption being that some of us in this room are using the same passwords in multiple places. 242 | 243 | So it's bad if your password is compromised on one server because, by transitivity, so can all of your other accounts be compromised. So in the world of hashing, this was the picture we drew some time ago, we can apply this same logic whereby, mathematically, a hash function is like some function F and the input is X and the output or the range is F of X. That was sort of the fancy way of describing mathematically hashing as a process weeks ago. But here, at a simpler level, the input to this process is going to be your actual password. The output is going to be a hash value, which in week 5 was something simple generally like a number-- 1 or 2 or 3 based on the first letter. That's not going to be quite as naive an approach as we take in the password world.
It's going to look a little more cryptic. 244 | 245 | So apple weeks ago might have just been 1, banana might have been 2. But now let me propose that in the world of real world system design, what the database people should actually store is not apple, but rather this cryptic value. And you can think of this as sort of random, but it's not random. Because it is the result of an algorithm, some mathematical function that someone implemented and smart people evaluated and said, yes, this seems to be secure, secure in the sense that this hash function is meant to be one way. 246 | 247 | So this is not encryption, a la Caesar Cipher from weeks ago whereby you could just add 1 to encrypt and subtract 1 to decrypt. This is one way in the sense that given this value, it should be pretty much impossible mathematically to reverse the process and figure out that the user's password was originally apple. Meanwhile banana, back in week 5 for simplicity, for hashing into a table, we might have had a simple output of 2, since B is the second letter of the English alphabet. But now the hash value of banana, thanks to a fancier mathematical function, is actually going to be something more cryptic like this. 248 | 249 | And so what the server really does is store not apple and banana, but rather those two seemingly cryptic values. And then when the human, be it Alice or Bob, logs in to a web form with their actual username and password, like Alice, apple, Bob, banana, the website no longer even knows that Alice's password is apple and that Bob's is banana. But that's OK. Because so long as the server uses the same code as it was using when these folks registered for accounts, Alice can type in apple, hit Enter, send it via HTTP to the server. 250 | 251 | The server can run that same hash function on A-P-P-L-E. And if the value matches, it can conclude with high probability, yes, this is, in fact, the original Alice or this, in fact, is the original Bob.
So the server never saves the password, but it does use the same hash function to compare those same hash values again and again whenever these folks log in again and again. 252 | 253 | So, in reality, here's a simple one-way hash for both Alice's and Bob's passwords in the real world. It's even longer, this is to say, than what I used as shorter examples a moment ago. But there is a corner case here. 254 | 255 | Suppose that an adversary is smart and has some free time and isn't necessarily interested in getting into someone's account right now, but wants to do a bit of prework to decrease the future cost of getting into someone's account. There is a technical term known as a rainbow table, which is essentially like a dictionary in the Python sense or the SQL sense, whereby in advance an adversary could just try hashing all of the fruits of the world or, really, all of the English words of the world or, rather, all possible four-digit, four-character, eight-character passcodes in advance and just store them in two columns-- the password, like 0000 or apple or banana, and then just store in advance the hash values. So the adversary could effectively reverse engineer the hash by just looking at a hash, comparing it against its massive database of hashes, and figuring out what password originally correspond to that. 256 | 257 | Why then is this still relatively safe? Rainbow tables are concerning. But they don't defeat passwords altogether. Why might that be? Yeah. 258 | 259 | AUDIENCE: [INAUDIBLE] 260 | 261 | DAVID MALAN: OK, so the adversary might not know exactly what hash function the company is using. Generally speaking, you would not want to necessarily keep that private. That would be considered security through obscurity. And all it takes is like one bad actor to tell the adversary what hash function is being used. And then that would put your security more at risk. 
So generally in the security world, openness when it comes to the algorithms and processes is generally considered best practice. And the reality is, there's a few popular hash functions out there that any company should be using. And so it's not really keeping a secret anyway. But other thoughts? Why is this rainbow table not such a concern? 262 | 263 | AUDIENCE: It takes a lot longer for the [INAUDIBLE]. 264 | 265 | DAVID MALAN: It takes a lot longer for the adversary to access that information because this table could get long. And even more along those lines-- anyone want to push a little harder? This doesn't necessarily put all of our passwords at risk. It easily puts our four-digit passcodes at risk. Why? Because this table, this dictionary would have, what, 10,000 rows? And we've seen that you can search that kind of like that or even regenerate all of the possible values. 266 | 267 | But once you get to eight-character passcodes, I said it was 6 quadrillion possibilities. That's a crazy big dictionary in Python or crazy big list of some sort in Python. That's just way more RAM or memory than a typical adversary is going to have. 268 | 269 | Now, maybe if it's a particularly resourced adversary like a government, a state more generally, maybe they do have supercomputers that can fit that much information. But, fine, then use a 16-character passcode and make it an unpronounceable long search space that's way bigger than 6 quadrillion. So it's a threat, but only if you're on that horrible top 10 list or top 100 or short passcode list that we've discussed thus far. 270 | 271 | So here's though a related threat that's just worth knowing about. What's problematic here? If we introduce two more users, Carol and Charlie, and just for the semantics of it, whose password happened to be cherry. What if they both happened to have the same password and this database is compromised? Some hacker gets in.
272 | 273 | And just to be clear, we wouldn't be storing apple, banana, cherry, cherry. We'd still be storing, according to this story, these hashes. But why is this still concerning? 274 | 275 | AUDIENCE: [INAUDIBLE] 276 | 277 | DAVID MALAN: Exactly. If you figure out just one of them, now you've got the other. And this is, in some sense, just leaking information, right? I don't know, maybe, at a glance what I could do with this information. But if Carol and Charlie have the same password, you know what? I bet they have the same password on other systems as well. You're leaking information that just does no good for anyone. 278 | 279 | So how can we avoid that? Well, we probably don't want to force Carol or Charlie to change their password, especially when they're registering. You definitely don't want to say, sorry, someone's already using that password, you can't use it as well. Because that too would leak information. But there's this technique in computing known as salting whereby we can do this instead. 280 | 281 | If cherry in this scheme hashes to a value like this, you know what? Let's go ahead and sprinkle a little bit of salt into the process. And it's sort of a metaphorical salt whereby this hash function now takes two inputs, not just the password, but some other value known as a salt. And the salt can be generally something super short like two characters even, or something longer. 282 | 283 | And the idea is that this salt, much like a recipe, should sort of perturb the output a little bit, make it taste a little bit differently, if you will. And so concretely, if we take the word cherry and then when Carol registers, for instance, we randomly choose a salt of 50, 5-0, so two characters, the hash value now-- because there's two inputs-- might now be this value. But if for Charlie, we still have cherry, but we change the 50, we might see this instead.
284 | 285 | Notice that for this first example, Carol, 50, the salt is preserved in the hash value, just so you know what it was and you can sprinkle the same amount of salt, so to speak, next time. But that's the whole hash value for Carol in this case. But if Charlie also has a password of cherry, but we change the salt to, say, 49 arbitrarily, that whole hash value changed. And so now in my hash database, I'm going to see different salts there, different values, which is going to effectively cover up the fact that Carol and Charlie have the same password. 286 | 287 | Now, if we have so many users that we run out of salts, that still might leak some information. But that's kind of a can we can kick down the road and probabilistically not going to happen if you require passwords of sufficiently long length, most likely. So any questions on salting, which to be clear, is just a mechanism for decreasing the probability that an adversary is going to glean information that you might not want them to have? 288 | 289 | So what does this mean concretely? When you get an email from a website saying "click this link to reset your password," it's not that the website, if well designed, is being difficult or shy and not telling you your password; the web administrators just do not know, ideally, your password. So what are they doing? They're probably sending you a link, similar in spirit to a one-time password, there's some random unique string in there that's unique to you. 290 | 291 | They've stored that in their database. So as soon as you click on that link, they check their database and be like, oh, wait a minute, I know I sent this link a minute ago to David. Let me just trust now-- because probabilistically there's no way someone guessed this URL within 60 seconds-- let's trust that whatever he wants to type in as his new password should be associated with that Malan account in the database.
292 | 293 | But if, conversely, you ever get an email saying your password is 123456 or whatever it is, it is clearly not being hashed, let alone salted, on the server. And that is not a website to do anything particularly sensitive with. All right, so what more can we do? Well, let's pick up where we left off in week two on the art of cryptography, this art, the science of scrambling information, but in a reversible way. 294 | 295 | So whereas hashing, as we've described it here, is really tends to be one-way, whereby you should not be able to reverse the process unless you cheat and make a massive table of all of the inputs and all of the outputs, which isn't really so much reversing as it is just looking it up. Cryptography, like in week 2, can actually be a solution to a lot of problems, not just sending messages across a crowded room. We, weeks ago, really focused on this type of cryptography whereby you've got some plain text message. You've got a key, like a secret number 1 or 13 or something else. The cipher, which might be a rotational cipher or a substitution cipher, some algorithm, and then ciphertext was the term of art for describing the scrambled version. That should look like random zeros and ones or letters of the alphabet or the like. 296 | 297 | This though was reversible, whereby you could just input the ciphertext with the key and get back out the plain text. Maybe you have to change a positive number to a negative number. But the key is really the same. Be it plus 1 minus 1 or plus 13 minus 13, the process was symmetric. And, indeed, what we talked about in week two was an example of something called secret key cryptography, where there's, indeed, one secret between two parties, a.k.a. symmetric cryptography. Because encryption is pretty much the same as decryption, but maybe you change the sign on the key itself. 298 | 299 | But this is not necessarily all we want. Because here's that general process. Here's the letter A. Here's the key of 1. 
We outputted in week 2 a value of B. That's not necessarily the solution to all of our problems. Why? 300 | 301 | Well, if two people want to communicate securely, they need some shared secret. So, for instance, if I wanted to send a secret message to Rongxin in the back of the room here, he and I had better have agreed upon a secret in advance. Otherwise, how can I possibly send a message, encrypt it in a way that he can reverse? I mean, I could be like, (WHISPERING) let's use a key of 1. (SPEAKING NORMALLY) But obviously, anyone in the middle has just now heard that. So we might as well not communicate securely at all. 302 | 303 | So there's this kind of chicken-and-the-egg problem, not just contrived here in lecture. But the first time I want to buy something on amazon.com with my credit card, I would like my credit card to be encrypted, scrambled somehow. But I don't know anyone personally at amazon.com, let alone someone with whom I've prearranged some secret for my Mac and their servers. So it seems that we fundamentally can't use symmetric cryptography all of the time, unless we have some other mechanism for securely generating that key, which we don't have as the common case in the world today. 304 | 305 | Thankfully, mathematicians years ago came up with something known as asymmetric cryptography, which does not require that you use the same secret in both directions. This is otherwise known as public key cryptography. And it works essentially as follows. When you want to take some plaintext message and encrypt it, you use the recipient's public key. So if Rongxin is my colleague in back and he has a public key, it is public by definition. He can literally shout for the whole room to hear what his public key is, which effectively is just some big, seemingly random number. But there's some mathematical significance to it. 306 | 307 | And I can write that down. Heck, you can all write it down if you too want to send him secure messages.
And out of those two inputs, we get one output, the ciphertext, that I can then hand off to people in the room in those virtual envelopes. And it doesn't matter if all of you have heard his public key. Because you can perhaps guess where this is going. How would Rongxin reverse this process? He's not going to use one public key. He's going to use, not surprisingly, a corresponding private key. 308 | 309 | And so in asymmetric cryptography or public key cryptography, you really have a key pair, a public key and a private key. And for our mathematical purposes today, let me just stipulate that there's some fancy math involved, such that when you choose that key or, really, those keys, there's a mathematical relationship between them. 310 | 311 | And knowing one does not really give you any information about the other. Why? Because these numbers are so darn big it would take adversaries more time than we all have on Earth to figure out via brute force what the corresponding private key is. 312 | 313 | The math is that good. And even as computers get faster, we just keep using bigger and bigger keys, more and more bits to make the math even harder for adversaries. So when Rongxin receives that message, he uses his private key, takes the ciphertext I sent him through the room, and gets back out the plaintext. So this is exactly how HTTPS works effectively to securely establish a channel between me and Amazon.com, gmail.com. Any website starting with https:// uses public key cryptography to come up with, initially, a secret. 314 | 315 | And in practice, it turns out, mathematically, it's faster to use secret key crypto. So very often, people will use asymmetric crypto to generate a big shared key and then use the faster algorithms thereafter. But it does solve asymmetric cryptography, that chicken-and-the-egg problem, by giving us all public keys and private keys. 
If you've heard of RSA, Diffie-Hellman, elliptic curve cryptography, there's different algorithms for this that you can actually study in higher level, more theoretical classes. But there's a bunch of different ways mathematically to solve this problem. But those are the primitives involved. 316 | 317 | And how many of you have heard of now passkeys, which is kind of only just catching on in recent months, literally. If I had to make any prediction this semester, odds are, you're going to see these in more and more places. And in fact, the next time you register for a website or log into a website, look for a link, a button that maybe doesn't say passkeys, per se. It's often called passwordless login. But it's really referring to the same thing. 318 | 319 | Passkeys are essentially a newish feature of operating systems, be it Mac OS or Windows or Linux or the OS running on your phone, that doesn't require that you choose a username and password anymore. Rather, when you visit a website for the very first time, your device will generate a public and private key pair. Your device will then send to the website for what you're registering your public key so that it has one of the values, but you keep your private key, indeed, private. 320 | 321 | And using the same mathematical process that I alluded to earlier, you can therefore log into that website in the future by proving mathematically that you are, in fact, the owner of the corresponding private key. So, in essence, if we use a picture like this, when you proceed to log in to that website again-- and, again, that website has stored your public key-- it essentially uses something known as digital signatures-- you're familiar with this term, you've heard it in the wild-- whereby the website will send you a challenge message, like some random number or string of text. It's just some random value. 
322 | 323 | If you then effectively encrypt it with your private key or run both of those through a particular algorithm, you'll get back a signature. And that signature can be verified by the website by using your public key. So digital signatures are kind of an application of cryptography but in the reverse direction. In the world of encryption, you use someone's public key to send a message encrypted. And they use their private key to decrypt it. 324 | 325 | In the world of signatures, or really passkeys, you reverse the process, whereby you use your private key to effectively encrypt some random challenge you've been sent. And the website, the third party, can use your public key to verify, OK, mathematically, that response came from David. Because I have his public key on file. So what's the upside of this? 326 | 327 | We just get out of the business of passwords and password managers more generally. You do have to trust and protect your devices, be it your phone or your laptop or desktop all the more. And that's going to open another possible threat. But this is a way to chip away at what is becoming the reality that you and I probably have dozens, hundreds of usernames and passwords, which is probably not sustainable long-term. And, indeed, we read too often about hacks in the wild as a result. Questions then on cryptography or passkeys? 328 | 329 | All right, just a few more building blocks to equip you for the real world before we sort of maybe do a final check for understanding of sorts. So when it comes to encryption, we can solve other problems as well. And this too is a feature you should increasingly be seeking out. So end-to-end encryption refers to a stronger use of encryption than most websites are actually in the habit of using. Case in point, if you're using HTTPS to send an email to Gmail, that's good because no one between you and Gmail servers presumably can see the message because it's encrypted. It just looks like random zeros and ones.
So it's effectively secure from people on the internet. 330 | 331 | The emails are not secure from like nosy employees at Google who do have access to those servers. Now, maybe through corporate policy, they shouldn't or physically don't. But, theoretically, there's someone at Google who could look at all of your email if they were so inclined. Hopefully it's just not a long list of people. 332 | 333 | But end-to-end encryption ensures that if you're sending a message from A to B, even if it's going through C in the middle-- be it Google or Microsoft or someone else-- end-to-end encryption means that you're encrypting it between A and B. And so even C in the middle has no idea what's going on. 334 | 335 | This is not true of services like Gmail or Outlook. This is true of services like iMessage or WhatsApp or Signal or Telegram or other services where if you poke around, you'll also literally see mention of end-to-end encryption. It's a feature that's becoming a little more commonplace, but something you should seek out when you don't necessarily trust or want to trust the machine in the middle, the point C between A and B. So, indeed, when sending messages on phones and even video conferencing nowadays too. 336 | 337 | And here's something where sometimes you kind of have to dig. Most of us are familiar with Zoom certainly by now. And if we go into Zoom settings, which I did this morning to take this screenshot, this is what it looks like as of now. Here's the menu of options for creating a new meeting. And toward the bottom here-- it's a little small-- you'll notice that you have two options for encryption. 338 | 339 | And, funny enough, the one that's typically selected by default, unless you opt in to the other one, is enhanced encryption. Brilliant marketing, right? Who doesn't want enhanced encryption? 340 | 341 | It is weaker than this encryption though, which is end-to-end encryption.
End-to-end encryption means that when you're having a video conference with one or more people, not even Zoom can see or hear what you're talking about. Enhanced encryption means no one between you and Zoom can hear or see what you're talking about. So end-to-end ensures that it's A to B, and if Zoom is C In the story, even Zoom can't see what you're doing. 342 | 343 | Now, there are some downsides. And there's some little fine print here. When you enable end-to-end encryption on a cloud-based service like Zoom, you can't use cloud recordings anymore. Why? Well, if Zoom by definition mathematically can't see or hear your meeting, how are they going to record it for you? It's just random zeros and ones. You can still record it locally on your Mac or PC, but end-to-end encryption ensures that you don't have to worry about prying eyes-- be it a company, be it a government, a state more generally. And so societally, you'll start to see this discussed probably even more than it already is when it comes to personal liberties and freedom among citizens of countries and states because of the implications for actual privacy that these primitives that we've been discussing and that you even explored in week 2, albeit weakly, with these ciphers we used in the real world. 344 | 345 | But encryption has one other use that's worth knowing about too and yet another feature to turn on. So when it comes to deleting files, odds are, most everyone in the room knows on a Mac or PC that when you drag a file to the trashcan or the recycle bin, it doesn't actually go away unless you right click or Control click or go to the appropriate menu and empty the trash. But did anyone know that even when you empty the trash or recycle bin, the file also doesn't really go away. Your operating system typically just forgets where it is. 
But the zeros and ones that compose the file or files you tried to delete are still there for the picking, especially if someone gets physical or virtual access to your system. 346 | 347 | So, for instance, here is a whole bunch of ones and zeros. Maybe it's representing something on my hard drive. And suppose that I want to go ahead and delete a file that comprises these zeros and ones, these bits here. Well, when your operating system deletes the file, even if you click on Empty Trash or Empty Recycle Bin, it essentially just forgets about those bits, but doesn't actually change them. Only once you create a new file or download something else do some of those zeros and ones end up getting overwritten. 348 | 349 | And per the yellow remnants here, the implication of this contrived example is that even at this point in time you can still recover like half of the file, it would seem. So maybe the juicy part with a credit card number or a message that you really wanted to delete or the like, there's still remnants on the computer's hard drive here. So what's the alternative? Well, if you really want to be thorough, you could delete files and then download the biggest possible movies you can to really fill up your hard drive. Because, probabilistically, you would end up overwriting all of those zeros and ones eventually. But that's not really a tenable solution. It would just take too much time and it's fraught with possible simple mistakes. 350 | 351 | So what should we do instead? Well, maybe we should securely delete information. And securely delete would mean when you actually empty the recycle bin or the trash can, what happens to the original zeros and ones is that you take them and you change all of them to zeros or all of them to ones or all of them to random zeros and ones. Why? So that you can still reuse those bits now, but there are no remnants even on the computer's hard drive that they were once there. 352 | 353 | But even now, this is not fully robust.
Why? It turns out that because of today's electronics and solid state devices, there might still be remnants of files on them because these hard drives, these storage devices nowadays are smart enough that if they realize that parts of them are failing, they might prevent you from changing data in certain corners. So if you think of your memory as like a big rectangle, some of the bits might get blocked off to you just over time. So there might still be remnants there. So if you really are worried about a sibling, an employer, or a government like finding data on that system, there might actually still be remnants. 354 | 355 | Now, you can go extreme and just physically destroy the device, which should be pretty effective. But that's going to get pretty expensive over time when you want to delete data. Or, again, we can use encryption as the solution to this problem. So, again, encryption is increasingly in the real world an amazing tool for your toolkit because it can be deployed in different ways. So, in this case, full disk encryption is something you can enable in Windows or Mac OS. 356 | 357 | Nowadays, it's typically enabled by default on iOS and you can opt in as well on other platforms. In the world of full disk encryption, instead of storing any of your files as a plain text, like in their original raw format, you essentially randomize everything on the disk instead. You rely on the user's password or some unique string that they know when you log into your Mac or PC to essentially scramble the entire contents of the hard drive. And it's not quite as simple as that. Typically, there's a much larger key that's used that in turn is protected by your actual password. 
But, in this case, this means that if someone steals your laptop while you're not paying attention in Starbucks or the airport or even your dorm room, even if they open the lid and don't have your password, they're not going to be able to access any of the data because it's just going to look like zeros and ones. Even if they remove the hard drive from your device, plug it into another device, they're only going to see zeros and ones. 358 | 359 | Now, if you walk away from your laptop at Starbucks with the lid open and you're logged in, there is a window of opportunity. Because the data has got to be decrypted when you care about it and when you're using it. So here too is another example of best practice. You should minimally be closing the lid of your laptop, making sure it's logging you out or at least locking the screen, so that someone can't just walk off with your device and have access to your logged in account. 360 | 361 | But full disk encryption essentially decreases the probability that an adversary is going to be successful. In the world of Macs, it's called FileVault. It's in your System Preferences. Windows, it's called BitLocker. There's third party solutions too. 362 | 363 | Here too, we have to trust that Microsoft and Apple don't screw up and write buggy code. But generally speaking, turning on features like these things are good for you. Except what's maybe an obvious downside of doing this? What's that? 364 | 365 | AUDIENCE: [INAUDIBLE] 366 | 367 | DAVID MALAN: Yeah, if you forget your password. There's no mathematician in the world who is probably going to be able to recover your data for you. So there too, it's maybe a hefty tradeoff. But hopefully you have enough defenses in place, be it your-- a good password, a password manager, maybe even printing out your primary password on a sheet of paper, but locking it in a box or bringing it home so that no one near you actually has physical access, you can at least mitigate some of these risks. 
368 | 369 | You'll read about, though, in the real world even this, which is like an adversarial use of full disk encryption. Sometimes when hackers get into systems, this has happened literally with hospital systems, municipal government systems, and the like. If they hack into them, they don't just delete the data or just create havoc, they will proactively encrypt the server's hard drive with some random key that only the hacker knows. They will then demand that the hospital or the town pay them, often in Bitcoin or some cryptocurrency to decrease the probability of being caught, and they'll only turn over that key to decrypt the data if someone actually pays up. So here too, there's sort of a dark side of these mathematical principles. So there too, it's always a tradeoff between good people and perhaps bad. 370 | 371 | Well, maybe before we wrap and before we serve some cake in the transept, Carter, can you join me one last time? But, first, before I turn things over to me and Carter, here's your problem set 10, a sort of unofficial homework. One, among your takeaways for today, you should start using a password manager or even these fancier passkeys, at least for your most sensitive accounts. So anything medical, financial, particularly personal, like this is a very concrete takeaway and action item. 372 | 373 | I wouldn't sit down and try to change all of your accounts over. Because knowing humans, you're not going to get through the whole to-do list. So maybe do it the next time you log into that account, turn on some of these features or add it to a password manager or at least start with the most important. 374 | 375 | Two, turning on two-factor authentication beyond where you have to at places like Harvard and Yale, but certainly bank accounts, privates, anything medical, personal, or the like. And then lastly, where you can, turning on end-to-end encryption.
Being careful with it, you don't want to go and during lecture, hopefully no one clicked the turn on FileVault button while we're in class. Because closing your laptop lid while things are being encrypted is generally bad practice. See us after though if you did do that a moment ago. 376 | 377 | So here's just then three actionable takeaways. But we thought we'd conclude by taking a few final minutes for a CS50 quiz show of sorts, a final check for understanding using some questions we come up with ourselves, but also some of the review questions that you all kindly contributed as part of the most recent problem set. So some of these questions come from you yourselves. And let me go ahead and turn things over to Carter here to help run the show. 378 | 379 | We will invite you at this point to take out that same device as you had earlier. This is the same URL as before. But if you closed the tab, you can reopen it here. To make things a little fun-- because we still have some cookies left-- could we get three final CS50 volunteers? OK, one hand is already up. How about two hands there? And how about three hands? Over here. All right, yes, sure, a round of applause for our final volunteers. Come on up. 380 | 381 | [APPLAUSE] 382 | 383 | On the line are some delicious Oreo cookies. If the three of you would like to come over and take any of these seats in the middle, you will be our human players, but we'll invite everyone in the group to play too. Do you want to take a mic and introduce yourself to the world? 384 | 385 | AUDIENCE: Sure. Hi, I'm Dani. I'm a first year in WIG C. And I'm planning on studying economics. 386 | 387 | DAVID MALAN: Nice, welcome. 388 | 389 | AUDIENCE: Hi, I'm Rochelle. I'm from the best state, Ohio. 390 | 391 | DAVID MALAN: [INAUDIBLE] 392 | 393 | AUDIENCE: And I'm a freshman in Greeno. I'm planning on concentrating in CS. 394 | 395 | DAVID MALAN: Nice, welcome. And? 396 | 397 | AUDIENCE: My name is Jackson. I'm from Indiana. 
I live in Thayer. I'm a first year. And I'm studying linguistics and Germanic languages and literatures. 398 | 399 | DAVID MALAN: Welcome as well. So, if our volunteers could have a seat, you're going to want to be able to see this screen or that one. So you can move your chairs if you would like. Carter is going to kindly cue up the software, which hopefully everyone has on their phones as well. And I should have mentioned, do you have your phone with you? 400 | 401 | AUDIENCE: [INAUDIBLE] 402 | 403 | DAVID MALAN: Do you have your phone with you? 404 | 405 | AUDIENCE: [INAUDIBLE] 406 | 407 | DAVID MALAN: OK, do you have your phone over there? OK, what's your name again? 408 | 409 | AUDIENCE: Rochelle. 410 | 411 | DAVID MALAN: OK, Rochelle will be right back, if you want to go grab your phones. And in the meantime, we're going to go ahead and-- thank you so much-- we're going to go ahead and cue up the screens here for the CS50 quiz show. It's about 20 questions in total, the first few of which are going to focus on cybersecurity to see how well we can check our current understanding. The rest will be questions written by you in the days leading up to today. 412 | 413 | All right, Carter, let's go ahead and reveal the first question. And note that you can win up to 1,000 points this time per question. It's not just about being right or wrong. And you get more points the faster you buzz in as well. So we'll see who's on the top based on all of the guest user names. 414 | 415 | All right, here we go, Carter, question one, what is the best way to create a password? Substitute letters with numbers or punctuation signs, ensure it's at least eight characters long, have a password manager generated for you, or include both lowercase and uppercase letters? All right, let's see what the results are. 416 | 417 | Almost everyone said have a password manager generate it for you. 90% of you said that's the case. And, indeed, that one is correct. Nicely done. 
418 | 419 | Let's go ahead and see the random usernames you've chosen. So this looks like it's web_hexidecimalidentifier to keep things anonymous. So if you are OAF9E, nicely done, but there's a whole lot of ties up at the top. All right, and I see-- well, just to keep things interesting, you had 792 points. You had-- 420 | 421 | AUDIENCE: 917. 422 | 423 | DAVID MALAN: 917 points, 917 points. So it's a close race here. Number two, what is a downside of two-factor authentication? You might lose access to the second factor. Your account becomes too secure. You can be notified someone else is trying to access your account. You can pick any authentication you like. Hopefully, you can reload. You might have missed that one. 424 | 425 | And the number one answer was might lose access to the second factor. Indeed, 93% of you got that. And we're up to 1,375 points, 792 points, and-- 426 | 427 | AUDIENCE: [INAUDIBLE] 428 | 429 | DAVID MALAN: OK, and forced reload. So, yes, you tried reloading the page and hopefully it'll click back in. All right, Carter, number 3. We have, what would you see if you tried to read an encrypted disk? You would see a random sequence of zeros and ones, scrambled words from the user's documents, all of the user's information, or all one's? About 10 seconds remain. Is it working for you now? OK. All right, three seconds. 430 | 431 | And the ranked answers are a random sequence of zeros and ones. 91% of you indeed got that right. Let's see who's winning on the guest screen. Web user a28c3, nicely done. But it's still a close tie among three of you anonymous participants. Number four, which type of encryption is most secure-- enhanced encryption, end-to-end encryption, full scale encryption, advanced encryption? About five seconds. 432 | 433 | And most popular response is the correct one, end-to-end encryption with 92% of you. Nice. We're up to 2,375, 3,792, and 2,917. And good job to these three folks in the front of our list. 
All right, Carter, number 5, the last on cybersecurity. When would it make sense to store your password on a sticky note by your computer? When it's too complicated to remember, when you need to access your account quickly, when you share your account with family members, never. Oh. And the most popular response was never, which is indeed correct. And only 79% of you think that right now. 434 | 435 | It is never OK to store it on a post-it note on your computer. You should minimally be using today's password manager for that same process. All right, two of you, a28c3 and c9a23 are still atop the list. We have 3,000-plus points, 3,000-plus points, and probably about the same as well. 436 | 437 | All right, now we move on to the user-generated content that you all from Harvard and Yale generated for us. Number 6, what is the variable type that stores true/false values? Boolean, string, integer, or double? About 10 seconds to come up with this. We saw these in different languages, these types. But the idea was the same. 438 | 439 | And in two seconds, we'll see that the answer is Boolean with 96% response rate. All right, what else do we have here? It's still a two-way tie at the top. All right, next question, Carter, is number 7. What placeholder would you use when trying to print a float in C, a float in C? Seven seconds. I'll defer to the visual syntax on the screen for this one. 440 | 441 | And the most popular and correct answer is, indeed, %f. We never saw %fl and we definitely didn't see %float. Two of you, though, are still in the lead. Nicely done, whoever you are. All right, next question, what does I++ do in C++ where I is an integer value? Note, for the record, we did not teach C++ in this course, but this question is from you. I will admit it's the same as in C, which we did teach. Decrements the integer, deletes the integer, increments the integer by one, or reassigns the integer to zero? 
442 | 443 | The most popular answer and correct answer is increments the integer by one. It definitely doesn't decrement, so. All right, two responses still atop the list. And here we have 6,000-plus, 6,000, and 6,000. So it's getting closer. Using a hash table to retrieve data is useful because it theoretically achieves a search time of O of n, O of n log n, O of log n, or O of 1? Five seconds to make your decision. Getting a little harder. And let's see the results. 444 | 445 | O of 1, only 30% of you got the correct answer from a very core week 5 topic. That is the theoretical hope of a hash table. In practice, though, to be fair, it can devolve, as we saw, into O of n. We didn't really see those other two answers in the context of hash tables specifically. All right, wow, a28c3 is in the lead now. Let's take a look at number 10, halfway there. What is the first program we made in CS50? This should be fast. All right, Greet, Meow, DNA, Hello, world? One second. 446 | 447 | And it was, indeed, Hello, world, Hello, world. All right, still in the lead with 10,000 points. And now let's move on to the second half. Question 11, when malloc is used to allocate memory in a C program, that memory is allocated in the pile, heap, bin, or stack? Very creative set of answers. Five seconds. 448 | 449 | All right, and the results have heap at 43%. Malloc was from the heap at the top. The stack is where function calls go. It's getting a little more worrisome here. But that's OK. Still in the lead with perfect score, it seems, 11,000 points. Next up is number 12. Which data structure allows you to change its size dynamically and store values in different areas of the memory-- an array, a queue, a linked list, or a stack? Change its size dynamically and store different values in different areas of the memory. 450 | 451 | And the answer from the group is a linked list at 62%, which is correct. An array, as we defined it, cannot be resized. 
You can create a new array, copy everything over. I'm starting to think maybe we shouldn't end the class on this note. But that's OK. We'll move on. 12,000 points for the lead. And number 13, what does CSS stand for in web development-- computer style sheets, cascading style sheets, creative style systems, colorful sheets styles? 452 | 453 | And most popular answer is correct with 81%, cascading style sheets. On the top 10 list here at 13,000 points, still a perfect score, and our three human volunteers are doing well here too. 14, how to represent a decimal number 5 in binary. All right, here we go. I'll let you read these. 454 | 455 | All right, fingers crossed, decimal number 5 in binary is, indeed, 101. Because a 4 plus 0 plus 1 gives us a decimal 5. All right, next question, and amazing a28c3, whoever you are out there, nicely done. Who is the CS50 mascot-- cat, duck, robot dog Spot, Oscar the Grouch? All of whom have appeared in some form. 456 | 457 | This one will be a little looser with answers, but looks like duck and cat were both the most popular. Duck has kind of become the mascot, suffice it to say. Cat is kind of everywhere on CS50 social media. So we'll accept cat as well. We love Spot, but he has only made that one appearance. 15,000. 458 | 459 | Final few questions, what is the output of printf quote, unquote, "1" plus quote, unquote, "2"? It will return an error, twelve, 3, or 12? English and digits respectively there. Six seconds. All right, one second. 460 | 461 | And 12 with 74% is correct. Because it's not quite 12, it is more rather 1, 2 because those are two strings that got concatenated, which would not actually be an error in that case. It's just not what you expect. All right, it's getting a little harder, but still someone's got a perfect score. What does LIFO stand for? Lost In First Order, Last In First Out, Let Inside Fall Outside, Long Indentation For Organization? Good one.
462 | 463 | Last In First Out, and we discussed this in the context of a stack. Because as you pile things on top of the stack, the last one in is the first one out. All right, nicely done, this player here. Three questions to go. On average, how early did you submit the weekly pset? A couple of days early, no rush, the morning of, a couple of hours early, but was not too nervous, 11:59:59, I live on the edge. Again, user-generated content. 464 | 465 | And the most popular answer-- [LAUGHTER] Carter and I conferred before class and we autocratically decreed that this is the only right answer and the only one we will accept here, though we appreciate the others as well. Wow, all right, did you take this class for the CS50 shirt? Yes, no, maybe, I'm not telling you? So that is this here shirt, which you'll get at the CS50 fair. One second. 466 | 467 | And, yes, no, maybe, I'm not telling you, this time, we'll accept all four of those, which brings us to our final question, at which point we'll reveal the scores of all of our participants and see if we can get the number one score online. What is the phrase that David says at the end of each lecture? 468 | 469 | [INTERPOSING VOICES] 470 | 471 | DAVID MALAN: All right, before we actually say what the right answer is, though we can show it, Carter, we'll see that there is 98%-- I've never said this at the end here, but 98% answers there. Let's go ahead and look at the top chart. Do we know who web_a28c3 is? Oh my goodness, come on down. And among our friends here, can you pull up each of your scores if you're able to see? 472 | 473 | And among our human volunteers, 16,792, 17,292, 16,958. So we have our human winner as well. So without further ado, allow me to thank our volunteers. Thanks so much to CS50 staff. We're about to give out some cookies and, if you want, some stress balls here. Cake is now served. And this was CS50. 
474 | 475 | [CHEERING] 476 | 477 | [INTERPOSING VOICES] 478 | 479 | [MUSIC PLAYING] 480 | 481 | -------------------------------------------------------------------------------- /examples/chat/anthropic/chat.py: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /examples/chat/anthropic/chat0.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import anthropic 6 | 7 | client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY")) 8 | 9 | message = client.messages.create( 10 | max_tokens=1024, 11 | messages=[ 12 | {"role": "user", "content": "Hello, World!"} 13 | ], 14 | model="claude-3-opus-20240229" 15 | ) 16 | 17 | response_text = message.content[0].text 18 | 19 | print(f"Assistant: {response_text}") -------------------------------------------------------------------------------- /examples/chat/anthropic/chat1.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import anthropic 6 | 7 | client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY")) 8 | 9 | user_input = input("User: ") 10 | 11 | message = client.messages.create( 12 | max_tokens=1024, 13 | messages=[ 14 | {"role": "user", "content": user_input} 15 | ], 16 | model="claude-3-opus-20240229" 17 | ) 18 | 19 | response_text = message.content[0].text 20 | 21 | print(f"Assistant: {response_text}") 22 | -------------------------------------------------------------------------------- /examples/chat/anthropic/chat2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import anthropic 6 | 7 | client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY")) 8 | 9 | 
user_input = input("User: ") 10 | 11 | system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 12 | 13 | message = client.messages.create( 14 | system=system_prompt, 15 | messages=[ 16 | {"role": "user", "content": user_input} 17 | ], 18 | model="claude-3-opus-20240229", 19 | max_tokens=1024 20 | ) 21 | 22 | response_text = message.content[0].text 23 | 24 | print(f"Assistant: {response_text}") -------------------------------------------------------------------------------- /examples/chat/anthropic/chat3.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import anthropic 6 | 7 | client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY")) 8 | 9 | system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 10 | 11 | messages = [] 12 | 13 | while True: 14 | 15 | user_input = input("User: ") 16 | messages.append({"role": "user", "content": user_input}) 17 | 18 | message = client.messages.create( 19 | system=system_prompt, 20 | messages=messages, 21 | model="claude-3-opus-20240229", 22 | max_tokens=1024 23 | ) 24 | 25 | response_text = message.content[0].text 26 | print(f"Assistant: {response_text}") 27 | 28 | messages.append({"role": "assistant", "content": response_text}) -------------------------------------------------------------------------------- /examples/chat/anthropic/chat4.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import anthropic 6 | 7 | client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API_KEY")) 8 | 9 | system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 
10 | 11 | messages = [] 12 | 13 | while True: 14 | 15 | user_input = input("User: ") 16 | messages.append({"role": "user", "content": user_input}) 17 | 18 | print(f"Assistant: ", end="", flush=True) 19 | 20 | response_text = "" 21 | with client.messages.stream( 22 | system=system_prompt, 23 | messages=messages, 24 | model="claude-3-opus-20240229", 25 | max_tokens=1024 26 | ) as stream: 27 | for text in stream.text_stream: 28 | response_text += text 29 | print(text, end="", flush=True) 30 | else: 31 | messages.append({"role": "assistant", "content": response_text}) 32 | print("", flush=True) -------------------------------------------------------------------------------- /examples/chat/google/chat.py: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /examples/chat/google/chat0_1.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import google.generativeai as genai 6 | 7 | genai.configure(api_key=os.environ["GEMINI_API_KEY"]) 8 | model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest") 9 | 10 | generated_content = model.generate_content( 11 | {"role": "user", "parts": ["Hello, World!"]} 12 | ) 13 | response_text = generated_content.candidates[0].content.parts[0].text.strip() 14 | 15 | print(f"Assistant: {response_text}") -------------------------------------------------------------------------------- /examples/chat/google/chat0_2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import google.generativeai as genai 6 | 7 | genai.configure(api_key=os.environ["GEMINI_API_KEY"]) 8 | model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest") 9 | 10 | convo = model.start_chat() 11 | convo.send_message("Hello, World!") 12 
| 13 | print(f"Assistant: {convo.last.text.strip()}") -------------------------------------------------------------------------------- /examples/chat/google/chat1_1.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import google.generativeai as genai 6 | 7 | genai.configure(api_key=os.environ["GEMINI_API_KEY"]) 8 | model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest") 9 | 10 | user_input = input("User: ") 11 | 12 | generated_content = model.generate_content( 13 | {"role": "user", "parts": [user_input]} 14 | ) 15 | response_text = generated_content.candidates[0].content.parts[0].text.strip() 16 | 17 | print(f"Assistant: {response_text}") -------------------------------------------------------------------------------- /examples/chat/google/chat1_2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import google.generativeai as genai 6 | 7 | genai.configure(api_key=os.environ["GEMINI_API_KEY"]) 8 | model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest") 9 | 10 | user_input = input("User: ") 11 | 12 | convo = model.start_chat() 13 | convo.send_message(user_input) 14 | 15 | print(f"Assistant: {convo.last.text.strip()}") -------------------------------------------------------------------------------- /examples/chat/google/chat2_1.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import google.generativeai as genai 6 | 7 | genai.configure(api_key=os.environ["GEMINI_API_KEY"]) 8 | 9 | system_instruction = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 
10 | model = genai.GenerativeModel( 11 | model_name="gemini-1.5-pro-latest", 12 | system_instruction=system_instruction 13 | ) 14 | 15 | user_input = input("User: ") 16 | 17 | generated_content = model.generate_content( 18 | {"role": "user", "parts": [user_input]} 19 | ) 20 | response_text = generated_content.candidates[0].content.parts[0].text.strip() 21 | 22 | print(f"Assistant: {response_text}") -------------------------------------------------------------------------------- /examples/chat/google/chat2_2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import google.generativeai as genai 6 | 7 | genai.configure(api_key=os.environ["GEMINI_API_KEY"]) 8 | 9 | system_instruction = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 10 | model = genai.GenerativeModel( 11 | model_name="gemini-1.5-pro-latest", 12 | system_instruction=system_instruction 13 | ) 14 | 15 | user_input = input("User: ") 16 | 17 | convo = model.start_chat() 18 | convo.send_message(user_input) 19 | 20 | print(f"Assistant: {convo.last.text.strip()}") -------------------------------------------------------------------------------- /examples/chat/google/chat3_1.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import google.generativeai as genai 6 | 7 | genai.configure(api_key=os.environ["GEMINI_API_KEY"]) 8 | 9 | system_instruction = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 
10 | model = genai.GenerativeModel( 11 | model_name="gemini-1.5-pro-latest", 12 | system_instruction=system_instruction 13 | ) 14 | 15 | messages = [] 16 | 17 | while True: 18 | 19 | user_input = input("User: ") 20 | messages.append({ 21 | "role": "user", "parts": [user_input] 22 | }) 23 | 24 | generated_content = model.generate_content(contents=messages) 25 | response_text = generated_content.candidates[0].content.parts[0].text.strip() 26 | print(f"Assistant: {response_text}") -------------------------------------------------------------------------------- /examples/chat/google/chat3_2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import google.generativeai as genai 6 | 7 | genai.configure(api_key=os.environ["GEMINI_API_KEY"]) 8 | 9 | system_instruction = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 10 | model = genai.GenerativeModel( 11 | model_name="gemini-1.5-pro-latest", 12 | system_instruction=system_instruction 13 | ) 14 | 15 | convo = model.start_chat() 16 | 17 | while True: 18 | 19 | user_input = input("User: ") 20 | convo.send_message(user_input) 21 | print(f"Assistant: {convo.last.text.strip()}") -------------------------------------------------------------------------------- /examples/chat/google/chat4.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | import google.generativeai as genai 6 | 7 | genai.configure(api_key=os.environ["GEMINI_API_KEY"]) 8 | 9 | system_instruction = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 
10 | model = genai.GenerativeModel( 11 | model_name="gemini-1.5-pro-latest", 12 | system_instruction=system_instruction 13 | ) 14 | 15 | messages = [] 16 | 17 | while True: 18 | 19 | user_input = input("User: ") 20 | messages.append({ 21 | "role": "user", "parts": [user_input] 22 | }) 23 | 24 | print(f"Assistant: ", end="", flush=True) 25 | response_text = "" 26 | 27 | generated_content = model.generate_content(contents=messages, stream=True) 28 | 29 | for each in generated_content: 30 | 31 | delta = each.candidates[0].content.parts[0].text.strip() 32 | print(delta, end="", flush=True) 33 | 34 | response_text += delta 35 | else: 36 | print() 37 | 38 | messages.append({"role": "model", "parts": [response_text]}) -------------------------------------------------------------------------------- /examples/chat/openai/chat.py: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /examples/chat/openai/chat0.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | # Import the OpenAI library, which allows us to interact with OpenAI's API 6 | from openai import OpenAI 7 | 8 | # Create a client instance to interact with the OpenAI API 9 | client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 10 | 11 | # Use the client to create a chat completion. This involves sending a message to the model and receiving a response. 12 | # The `messages` parameter is a list of messages where each message is a dictionary specifying the role (e.g., "user") and the content of the message. 13 | chat_completion = client.chat.completions.create( 14 | messages=[ 15 | {"role": "user", "content": "Hello, World!"} # The message sent to the model: a user saying "Hello, World!" 
16 | ], 17 | model="gpt-4o" # Specifies that we are using the "gpt-4o" model for this completion 18 | ) 19 | 20 | # Extract the response text from the first choice of the chat completion. The API can return multiple choices, but here we are only interested in the first one. 21 | response_text = chat_completion.choices[0].message.content 22 | 23 | # Print the response from the assistant, formatted with a prefix to indicate that it's the assistant's message. 24 | print(f"Assistant: {response_text}") 25 | -------------------------------------------------------------------------------- /examples/chat/openai/chat1.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | # Import the OpenAI library to interact with OpenAI's API 6 | from openai import OpenAI 7 | 8 | # Create a client instance to interact with the OpenAI API 9 | client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 10 | 11 | # Prompt the user for input and store the input in a variable 12 | user_prompt = input("User: ") 13 | 14 | # Use the client to create a chat completion. This involves sending the user's input to the model and receiving a response. 15 | # The `messages` parameter is a list of messages, with each message being a dictionary specifying the role (e.g., "user") and the content of the message. 16 | chat_completion = client.chat.completions.create( 17 | messages=[ 18 | {"role": "user", "content": user_prompt} # The message sent to the model is the user's input 19 | ], 20 | model="gpt-4o", # Specifies that we are using the "gpt-4o" model for this completion 21 | ) 22 | 23 | # Extract the response text from the first choice of the chat completion. The API can return multiple choices, but here we are only interested in the first one. 
24 | response_text = chat_completion.choices[0].message.content 25 | 26 | # Print the response from the assistant, formatted with a prefix to indicate that it's the assistant's message. 27 | print(f"Assistant: {response_text}") 28 | -------------------------------------------------------------------------------- /examples/chat/openai/chat2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | # Import the OpenAI library to enable API interactions 6 | from openai import OpenAI 7 | 8 | # Initialize the OpenAI API client 9 | client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 10 | 11 | # Define the system prompt which sets the context for the chatbot. 12 | system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 13 | 14 | # Prompt the user for input and store the response in a variable 15 | user_prompt = input("User: ") 16 | 17 | # Send the system and user prompts to the OpenAI API, requesting a chat completion. The 'system' role sets the context, and the 'user' role represents the user's input. 
18 | chat_completion = client.chat.completions.create( 19 | messages=[ 20 | {"role": "system", "content": system_prompt}, # The system/context message 21 | {"role": "user", "content": user_prompt} # The user's message 22 | ], 23 | model="gpt-4o", # Specifies using the "gpt-4o" model for generating responses 24 | ) 25 | 26 | # Extract the chatbot's response from the completion 27 | response_text = chat_completion.choices[0].message.content 28 | 29 | # Display the chatbot's (assistant's) response 30 | print(f"Assistant: {response_text}") 31 | -------------------------------------------------------------------------------- /examples/chat/openai/chat3.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | # Import the OpenAI library for API interactions 6 | from openai import OpenAI 7 | 8 | # Initialize the OpenAI API client 9 | client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 10 | 11 | # Define a system prompt that sets the context for the conversation. 12 | system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 
13 | 14 | # Initialize a list to keep track of the conversation messages 15 | messages = [] 16 | 17 | # Add the system prompt to the messages list with the role 'system' to set the context for the chat 18 | messages.append({"role": "system", "content": system_prompt}) 19 | 20 | # Start an infinite loop to continually accept user input and generate responses 21 | while True: 22 | 23 | # Prompt the user for input 24 | user_prompt = input("User: ") 25 | 26 | # Append the user's message to the conversation with the role 'user' 27 | messages.append({"role": "user", "content": user_prompt}) 28 | 29 | # Request a chat completion from the OpenAI API based on the conversation so far 30 | chat_completion = client.chat.completions.create( 31 | messages=messages, 32 | model="gpt-4o", # Specifies using the "gpt-4o" model 33 | ) 34 | 35 | # Extract the response text from the completion 36 | response_text = chat_completion.choices[0].message.content 37 | 38 | # Print the assistant's response, stripping any leading/trailing whitespace 39 | print(f"Assistant: {response_text.strip()}") 40 | 41 | # Append the assistant's response to the conversation, allowing the AI to maintain context in future interactions 42 | messages.append({"role": "assistant", "content": response_text}) 43 | -------------------------------------------------------------------------------- /examples/chat/openai/chat4.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | # Import the OpenAI library for API interactions 6 | from openai import OpenAI 7 | 8 | # Initialize the OpenAI API client 9 | client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 10 | 11 | # Define a system prompt that sets the context for the conversation. 12 | system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 
13 | 14 | # Initialize a list to keep track of the conversation messages 15 | messages = [] 16 | 17 | # Add the system prompt to the messages list with the role 'system' to set the context for the chat 18 | messages.append({"role": "system", "content": system_prompt}) 19 | 20 | # Start an infinite loop to continually accept user input and generate responses 21 | while True: 22 | 23 | # Prompt the user for input 24 | user_prompt = input("User: ") 25 | 26 | # Append the user's message to the conversation with the role 'user' 27 | messages.append({"role": "user", "content": user_prompt}) 28 | 29 | # Request a chat completion from the OpenAI API based on the conversation so far 30 | chat_completion_stream = client.chat.completions.create( 31 | messages=messages, 32 | model="gpt-4o", 33 | # Sets the sampling temperature to control randomness (OpenAI accepts values from 0 to 2; lower values such as 0.2 make the output more focused and deterministic, higher values make it more random). 34 | temperature=0.2, 35 | stream=True, # Enable streaming for long-running completions 36 | ) 37 | 38 | # Prepare to print the assistant's response incrementally 39 | print(f"Assistant: ", end="", flush=True) 40 | 41 | # Initialize a variable to accumulate the response text 42 | response_text = "" 43 | 44 | # Iterate over each part of the streamed response 45 | for part in chat_completion_stream: 46 | 47 | # Extract text content from the current part; fallback to an empty string if none 48 | delta = part.choices[0].delta.content or "" 49 | 50 | # Accumulate the response text 51 | response_text += delta 52 | 53 | # Print the current part of the response, without adding a new line 54 | print(delta, end="", flush=True) 55 | else: 56 | 57 | # Append the assistant's response to the conversation, allowing the AI to maintain context in future interactions 58 | messages.append({"role": "assistant", "content": response_text}) 59 | 60 | # Print a newline once the entire response has been streamed 61 | print() 62 | 63 | 64 |
-------------------------------------------------------------------------------- /examples/embeddings/openai/00_embedding_example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from dotenv import load_dotenv 4 | load_dotenv() 5 | 6 | # Import the OpenAI module to use OpenAI's API 7 | from openai import OpenAI 8 | 9 | # Initialize the OpenAI client 10 | client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 11 | 12 | # Create an embedding for the input text "cat" using the specified model "text-embedding-3-small" 13 | response = client.embeddings.create( 14 | input="cat", 15 | model="text-embedding-3-small" 16 | ) 17 | 18 | # Print the embedding for the input text from the response 19 | embedding = response.data[0].embedding 20 | print(embedding) 21 | print(f"Shape: {np.array(embedding).shape}") -------------------------------------------------------------------------------- /examples/embeddings/openai/01_create_embeddings.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from dotenv import load_dotenv 4 | load_dotenv() 5 | 6 | # Import the OpenAI module to use OpenAI's API 7 | from openai import OpenAI 8 | 9 | # Initialize the OpenAI client 10 | client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 11 | 12 | # Define a function to get the embedding of a text using a specified model 13 | def get_embedding(text, model="text-embedding-3-small"): 14 | 15 | # Replace newline characters with spaces in the text 16 | text = text.replace("\n", " ") 17 | 18 | # Use the OpenAI client to create embeddings for the text using the specified model 19 | # and return the first embedding from the result 20 | return client.embeddings.create(input=[text], model=model).data[0].embedding 21 | 22 | 23 | # Open the file 'ai.txt' from the 'data/transcripts' directory in read mode 24 | FILE_PATH = "../../../data/transcripts/ai.txt" 25 | with 
open(FILE_PATH, "r") as f: 26 | 27 | # Read the entire content of the file into 'data' 28 | data = f.read() 29 | 30 | # Split the data into chunks of 500 characters and store them in a list called 'chunks' 31 | chunks = [data[i:i+500] for i in range(0, len(data), 500)] 32 | 33 | # Initialize a dictionary to hold the mapping of text chunks to their embeddings 34 | embeddings = {} 35 | 36 | # Loop through each chunk in the chunks list 37 | print("Creating embeddings...") 38 | for chunk in chunks: 39 | 40 | # Get the embedding for the current chunk and store it in the 'embeddings' dictionary 41 | embeddings[chunk] = get_embedding(chunk) 42 | 43 | # Open a new file 'embeddings.jsonl' in write mode 44 | print("Writing embeddings to file...") 45 | with open("embeddings.jsonl", "w") as f: 46 | 47 | # Loop through the items in the 'embeddings' dictionary 48 | for chunk, embedding in embeddings.items(): 49 | 50 | # Write each chunk and its embedding as a JSON object to the 'embeddings.jsonl' file 51 | f.write((f'{{"text": {json.dumps(chunk)}, "embedding": {embedding}}}\n')) 52 | 53 | print("Embeddings written to file 'embeddings.jsonl'") 54 | -------------------------------------------------------------------------------- /examples/embeddings/openai/02_search_embeddings.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | from dotenv import load_dotenv 5 | load_dotenv() 6 | 7 | from openai import OpenAI 8 | 9 | # Initialize the OpenAI client 10 | client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 11 | 12 | # Define a function to get the embedding of a text using a specified model 13 | def get_embedding(text, model="text-embedding-3-small"): 14 | # Replace newline characters with spaces in the text 15 | text = text.replace("\n", " ") 16 | 17 | # Use the OpenAI client to create embeddings for the text using the specified model 18 | # and return the first embedding from the result 19 | 
return client.embeddings.create(input=[text], model=model).data[0].embedding 20 | 21 | # Open the 'embeddings.jsonl' file in read mode 22 | with open('embeddings.jsonl', 'r') as f: 23 | 24 | # Read all lines from the file 25 | lines = f.readlines() 26 | 27 | # Initialize a dictionary to load the embeddings 28 | embeddings = {} 29 | 30 | # Loop through each line in the file 31 | for line in lines: 32 | 33 | # Parse the JSON object in the line 34 | line = json.loads(line) 35 | 36 | # Map the text chunk to its corresponding embedding in the embeddings dictionary 37 | embeddings[line['text']] = line['embedding'] 38 | 39 | # Prompt the user to enter a query 40 | query = input("Enter a query: ") 41 | 42 | # Get the embedding for the query 43 | query_embedding = get_embedding(query) 44 | 45 | # Initialize variables to track the best matching chunk and its score 46 | best_chunk = None 47 | best_score = float("-inf") 48 | 49 | # Loop through each chunk and its embedding in the embeddings dictionary 50 | for chunk, embedding in embeddings.items(): 51 | 52 | # Compute the similarity score as the dot product of the embedding vectors 53 | score = np.dot(embedding, query_embedding) 54 | 55 | # If this score is better than the best score found so far, 56 | # update the best_chunk and best_score with the current chunk and score 57 | if score > best_score: 58 | best_chunk = chunk 59 | best_score = score 60 | 61 | # Note: OpenAI embeddings are normalized to length 1, which means that: 62 | # Cosine similarity can be computed slightly faster using just a dot product 63 | # Cosine similarity and Euclidean distance will result in the identical rankings 64 | # https://help.openai.com/en/articles/6824809-embeddings-frequently-asked-questions 65 | 66 | # Print the chunk that is most similar to the query 67 | print(best_chunk) 68 | -------------------------------------------------------------------------------- /examples/embeddings/openai/03_qa_with_embeddings_based_search.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | from dotenv import load_dotenv 5 | load_dotenv() 6 | 7 | from openai import OpenAI 8 | 9 | # Initialize the OpenAI client 10 | client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) 11 | 12 | # Define a function to get the embedding of a given text using a specified model 13 | def get_embedding(text, model="text-embedding-3-small"): 14 | 15 | # Replace newline characters with spaces to ensure consistent formatting 16 | text = text.replace("\n", " ") 17 | 18 | # Request the embedding for the text from the OpenAI API and return the first embedding from the result 19 | return client.embeddings.create(input=[text], model=model).data[0].embedding 20 | 21 | # Open the 'embeddings.jsonl' file in read mode to load pre-computed embeddings 22 | with open('embeddings.jsonl', 'r') as f: 23 | 24 | # Read all lines from the file 25 | lines = f.readlines() 26 | 27 | # Initialize a dictionary to store the embeddings 28 | embeddings = {} 29 | 30 | # Loop through each line in the file, assuming each line is a JSON object 31 | for line in lines: 32 | 33 | # Parse the JSON object from the line 34 | line = json.loads(line) 35 | 36 | # Store the text chunk and its corresponding embedding in the dictionary 37 | embeddings[line['text']] = line['embedding'] 38 | 39 | # System prompt that sets the context for the chat completion API call 40 | system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat." 
41 | 42 | # Prompt the user for their query 43 | user_query = input("User: ") 44 | 45 | # Get the embedding for the user's query using the function defined earlier 46 | query_embedding = get_embedding(user_query) 47 | 48 | # Initialize variables to track the best matching chunk and its similarity score 49 | best_chunk = None 50 | best_score = float("-inf") 51 | 52 | # Loop through each chunk and its embedding in the embeddings dictionary 53 | for chunk, embedding in embeddings.items(): 54 | 55 | # Compute the similarity score as the dot product of the query embedding and the chunk's embedding 56 | score = np.dot(embedding, query_embedding) 57 | 58 | # Update the best_chunk and best_score if the current score is higher 59 | if score > best_score: 60 | best_chunk = chunk 61 | best_score = score 62 | 63 | # Prepare the prompt for the chat completion by including the best matching chunk and the user's query 64 | prompt = "Answer the question using the following information delimited by triple backticks:\n\n" 65 | prompt += f"```\n{best_chunk}\n```" 66 | prompt += "\nQuestion: " + user_query 67 | 68 | print(f"Prompt:\n\n{prompt}\n") 69 | 70 | # Generate a response using the OpenAI Chat Completion API with the prepared prompt and system context 71 | chat_completion = client.chat.completions.create( 72 | messages=[ 73 | {"role": "system", "content": f"{system_prompt}"}, 74 | {"role": "user", "content": f"{prompt}"} 75 | ], 76 | model="gpt-4o", 77 | ) 78 | 79 | # Extract the text of the generated response 80 | response_text = chat_completion.choices[0].message.content 81 | 82 | # Print the assistant's response 83 | print(f"Assistant: {response_text}") 84 | -------------------------------------------------------------------------------- /examples/embeddings/openai/create_embedding.sh: -------------------------------------------------------------------------------- 1 | curl https://api.openai.com/v1/embeddings \ 2 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 3 | -H
import os
import time
from dotenv import load_dotenv
load_dotenv()

from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Run statuses after which polling would never observe "completed"
FAILED_RUN_STATUSES = {"failed", "cancelled", "expired", "incomplete"}


def clean_up(assistant_id, thread_id, vector_store_id, file_ids):
    """Delete the assistant, thread, vector store, and uploaded files."""

    client.beta.assistants.delete(assistant_id)
    client.beta.threads.delete(thread_id)
    client.beta.vector_stores.delete(vector_store_id)
    for file_id in file_ids:
        client.files.delete(file_id)


FILES_DIR = "../../../data/transcripts/"
file_ids = []

# Iterate over each file in the specified directory
for file in sorted(os.listdir(FILES_DIR)):

    # Upload each file to the OpenAI platform with the purpose set to 'assistants';
    # the context manager closes the local handle once the upload call returns
    with open(os.path.join(FILES_DIR, file), "rb") as transcript:
        _file = client.files.create(file=transcript, purpose="assistants")

    # Append the reference to the uploaded file to the list
    file_ids.append(_file.id)
    print(f"Uploaded file: {_file.id} - {file}")

# Create a vector store using the uploaded files
vector_store = client.beta.vector_stores.create(
    name="CS50 Lecture Captions",
    file_ids=file_ids
)
print(f"Created vector store: {vector_store.id} - {vector_store.name}")

# Create an assistant with the specified instructions, persona, and behavior.
# Adjacent string literals are joined with nothing in between, so each sentence
# carries its own trailing space.
instructions = (
    "You are a friendly and supportive teaching assistant for CS50. "
    "You are also a rubber duck. "
    "Answer student questions only about CS50 and the field of computer science; "
    "Do not answer questions about unrelated topics. "
    "Do not provide full answers to problem sets, as this would violate academic honesty"
)
assistant = client.beta.assistants.create(
    instructions=instructions,
    name="CS50 Duck",
    tools=[{"type": "file_search"}],
    tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
    model="gpt-4o",
)
print(f"Created assistant: {assistant.id} - {assistant.name}")

# Create a new thread
thread = client.beta.threads.create()

try:
    client.beta.threads.messages.create(
        thread.id,
        role="user",
        content=input("User: ")
    )

    # Create a new run
    run = client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=assistant.id
    )

    # Continuously check the status of the assistant's processing
    print(f"Running assistant: {assistant.id} in thread: {thread.id}")
    while True:

        # Retrieve the latest status of the assistant's processing
        _run = client.beta.threads.runs.retrieve(thread_id=run.thread_id, run_id=run.id)

        # If processing is complete, display the assistant's response and exit the loop
        if _run.status == "completed":
            thread_messages = client.beta.threads.messages.list(run.thread_id)
            print(f"Assistant: {thread_messages.data[0].content[0].text.value}")
            break

        # A run that ended any other way will never complete; stop polling
        if _run.status in FAILED_RUN_STATUSES:
            print(f"Run ended with status: {_run.status}")
            break

        # Wait for a short period before checking the status again
        time.sleep(1)
finally:
    # Clean up the assistant, thread, vector store, and uploaded files,
    # whether the run completed, failed, or an error was raised
    clean_up(assistant.id, thread.id, vector_store.id, file_ids)
import os
from dotenv import load_dotenv
load_dotenv()

from openai import OpenAI
from openai import AssistantEventHandler
from typing_extensions import override  # Import override decorator if needed
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


# Define the EventHandler class as per the API documentation
# https://platform.openai.com/docs/assistants/overview?context=with-streaming
class EventHandler(AssistantEventHandler):
    """Print the assistant's streamed output to stdout as events arrive."""

    @override
    def on_text_created(self, text) -> None:
        """Print the "Assistant: " prefix when a text block starts."""
        print("\nAssistant: ", end="", flush=True)

    @override
    def on_text_delta(self, delta, snapshot):
        """Print each text fragment immediately, without a trailing newline."""
        print(delta.value, end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        """Ignore tool-call creation events; nothing is printed for them."""
        pass

    @override
    def on_tool_call_delta(self, delta, snapshot):
        """Print code-interpreter input and log output when such deltas arrive."""
        if delta.type == 'code_interpreter':
            if delta.code_interpreter.input:
                print(delta.code_interpreter.input, end="", flush=True)
            if delta.code_interpreter.outputs:
                print("\n\noutput >", flush=True)
                for output in delta.code_interpreter.outputs:
                    if output.type == "logs":
                        print(f"\n{output.logs}", flush=True)


def clean_up(assistant_id, thread_id, vector_store_id, file_ids):
    """Delete the assistant, thread, vector store, and uploaded files."""

    client.beta.assistants.delete(assistant_id)
    client.beta.threads.delete(thread_id)
    client.beta.vector_stores.delete(vector_store_id)
    for file_id in file_ids:
        client.files.delete(file_id)


FILES_DIR = "../../../data/transcripts/"
file_ids = []

# Iterate over each file in the specified directory
for file in sorted(os.listdir(FILES_DIR)):

    # Upload each file to the OpenAI platform with the purpose set to 'assistants';
    # the context manager closes the local handle once the upload call returns
    with open(os.path.join(FILES_DIR, file), "rb") as transcript:
        _file = client.files.create(file=transcript, purpose="assistants")

    # Append the reference to the uploaded file to the list
    file_ids.append(_file.id)
    print(f"Uploaded file: {_file.id} - {file}")

# Create a vector store using the uploaded files
vector_store = client.beta.vector_stores.create(
    name="CS50 Lecture Captions",
    file_ids=file_ids
)
print(f"Created vector store: {vector_store.id} - {vector_store.name}")

# Create an assistant with the specified instructions, persona, and behavior.
# Adjacent string literals are joined with nothing in between, so each sentence
# carries its own trailing space.
instructions = (
    "You are a friendly and supportive teaching assistant for CS50. "
    "You are also a rubber duck. "
    "Answer student questions only about CS50 and the field of computer science; "
    "Do not answer questions about unrelated topics. "
    "Do not provide full answers to problem sets, as this would violate academic honesty"
)
assistant = client.beta.assistants.create(
    instructions=instructions,
    name="CS50 Duck",
    tools=[{"type": "file_search"}],
    tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
    model="gpt-4o",
)
print(f"Created assistant: {assistant.id} - {assistant.name}")

# Create a new thread
thread = client.beta.threads.create()

try:
    client.beta.threads.messages.create(
        thread.id,
        role="user",
        content=input("User: ")
    )

    # Stream the run, letting EventHandler print output as it is produced
    print(f"Running assistant: {assistant.id} in thread: {thread.id}")
    with client.beta.threads.runs.stream(
        thread_id=thread.id,
        assistant_id=assistant.id,
        event_handler=EventHandler()
    ) as stream:
        stream.until_done()
    print()
finally:
    # Clean up the assistant, thread, vector store, and uploaded files,
    # even if streaming raised
    clean_up(assistant.id, thread.id, vector_store.id, file_ids)
# This script deletes all assistants, files, and vector stores from your OpenAI account.
# !!! Proceed with caution, as this action is irreversible. !!!

import os
from dotenv import load_dotenv
load_dotenv()

from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def delete_all(resources, delete):
    """Delete every item in ``resources``, continuing past individual failures.

    Args:
        resources: Iterable of objects exposing an ``id`` attribute.
        delete: Callable that takes an id and deletes the matching resource.

    Each deletion response is printed. An exception raised for one item is
    printed instead of propagated, so the remaining items are still attempted.
    """
    for resource in resources:
        try:
            print(delete(resource.id))
        except Exception as e:
            # Best-effort sweep: report the failure and move on to the next item
            print(e)


# Prompt user to confirm deletion
confirmation = input("Are you sure you want to delete all assistants, files, and vector stores? (yes/no) ")
if confirmation != "yes":
    print("Exiting...")
    # SystemExit does not depend on the interactive `exit` helper being installed
    raise SystemExit

# Delete all assistants (limit is an integer page size)
delete_all(client.beta.assistants.list(order="desc", limit=100), client.beta.assistants.delete)

# Delete all files
delete_all(client.files.list(), client.files.delete)

# Delete all vector stores
delete_all(client.beta.vector_stores.list(), client.beta.vector_stores.delete)