├── LICENSE.md ├── README.md ├── Week 1 Introduction to Statistics and Probability ├── 1.1 Introduction to Statistics and Probability.rtf ├── 1.2 What is Probability Theory.rtf ├── 1.3 What is Statistics.rtf ├── 1.4 A Puzzle.rtf ├── 1.5 A poster.rtf ├── 1.5 History of Probability and Statistics.rtf ├── 1.5 asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Histor.pdf ├── 1.7 Week1-IntroMotivation.zip ├── 1.8 HW_1 Py_3.6.ipynb ├── 1.8 HW_1.ipynb ├── 1.9 Programming Assignment.pdf ├── Install Software.docx ├── Install Software.pdf ├── Quiz 1.pdf ├── Week1-IntroMotivation │ ├── 1.What-is-Probability.ipynb │ ├── 2.What-is-Statistics_.ipynb │ ├── 3.Long-term-frequencies.ipynb │ ├── 4.The-Three-card-Puzzle.ipynb │ └── images │ │ ├── AB.png │ │ ├── Hitting.png │ │ └── Scorecard.jpg └── asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Week_1_sid.pdf ├── Week 10 Confidence Intervals and Hypothesis Testing ├── 1.1. Confidence Interval on Mean Part 1.rtf ├── 1.2. Confidence Interval on Mean Part 2.rtf ├── 2. Sigma Unknown.rtf ├── 2.1. Sigma Unknown Example.rtf ├── 3 Hypothesis Testing.rtf ├── 4. Hypothesis Testing - p-Values.rtf ├── 5. Tea Testing.rtf ├── 6. Hypothesis Testing - Z and T Tests.rtf ├── Expectation_variance_and_covariance.ipynb ├── HW_10.ipynb ├── Problem Set 10.pdf ├── Programming Assignment.pdf ├── Week_10_Part_1.pdf ├── Week_10_Part_2.pdf ├── Week_10_Part_3.pdf ├── Week_10_Part_4.pdf └── Week_10_Part_5.pdf ├── Week 2 Sets ├── 1 Week_2.pdf ├── 2.1 Notation Elements, sets, and membership.rtf ├── 2.2 Basic Sets Some simple sets.rtf ├── 2.3 Venn Diagrams Visualizing Sets.rtf ├── 2.4 Relations Complement, Intersection, Union.rtf ├── 2.5 Operations Set Operations.rtf ├── 2.6 Cartesian Products Tuples and products.rtf ├── 2.7 Russell's Paradox Russell's Paradox.rtf ├── 2_sets_hw.ipynb ├── Polls _ 2.11 Discussion Section _ DSE210x Courseware _ edX.html ├── Problem Set 2 _ 2.9 Problem Sets _ DSE210x Courseware _ edX.html ├── Problem Set 2.pdf ├── Programming Assignment _ 2.10 Programming Assignment _ DSE210x Courseware _ edX.html ├── Programming Assignment.pdf ├── Quiz 2 _ 2.8 Comprehension Quiz _ DSE210x Courseware _ edX.html ├── Quiz 2.pdf └── Sets.ipynb ├── Week 3 Counting and Combinatorics ├── 1 Sets.rtf ├── 10 Combinations.rtf ├── 11 Binomial Coefficient.rtf ├── 12 Properties of Binomial Coefficient.rtf ├── 13 Pascal.rtf ├── 14 Multinomial Coefficients.rtf ├── 15 Beyond Combinatorics.rtf ├── 2 Disjoint Union.rtf ├── 3 Products.rtf ├── 4 Mix It Up.rtf ├── 5 Counting Cartesian Powers.rtf ├── 6 Counting Variations.rtf ├── 7 Counting Trees.rtf ├── 8 Permutations.rtf ├── 9 Partial Permutations.rtf ├── Problem Set 3 _ 3.17 Problem Sets _ DSE210x Courseware _ edX.html ├── Programming Assignment _ 3.18 Programming Assignment _ DSE210x Courseware _ edX.html ├── Quiz 3 _ 3.16 Comprehension Quiz _ DSE210x Courseware _ edX.html ├── Week_3_Part_1.pdf ├── Week_3_Part_2.pdf └── week3.zip ├── Week 4 Probability and Conditioning ├── 1 Distribution Types.rtf ├── 10 Sequences.rtf ├── 11 Total Probability.rtf ├── 12 Bayes' Rule.rtf ├── 2 Distribution Types.rtf ├── 3 Events.rtf ├── 3_Counting.ipynb ├── 4 Repeated Experiments.rtf ├── 4_Permutations_and_Combinations.ipynb ├── 6 Axioms.rtf ├── 7 Inequalities.rtf ├── 8 Conditional Probability.rtf ├── 9 Independence.rtf ├── HW_4.ipynb ├── Problem Set 4 _ 4.14 Problem Sets _ DSE210x Courseware _ edX.html ├── Programming Assignment _ 4.15 Programming Assignment _ DSE210x Courseware _ edX.html ├── Quiz 4 _ 4.13 Comprehension Quiz _ DSE210x Courseware _ 
edX.html ├── Week_4_Part_1.pdf └── Week_4_Part_2.pdf ├── Week 5 Random Variables, Expectation, and Variance ├── 1 Random Variables.rtf ├── 2 Cumulative Distribution Function.rtf ├── 3 Expectation.rtf ├── 4 Variable Modification.rtf ├── 5 Expectation of Functions.rtf ├── 5_Probability.ipynb ├── 6 Variance.rtf ├── 7 Two Variables.rtf ├── 8 Linearity of Expectations.rtf ├── 9 Covariance.rtf ├── Problem Set 5 _ 5.11 Problem Sets _ DSE210x Courseware _ edX.html ├── Programming Assignment _ 5.12 Programming Assignment _ DSE210x Courseware _ edX.html ├── Quiz 5 _ 5.10 Comprehension Quiz _ DSE210x Courseware _ edX.html ├── Week_5_Part_1.pdf ├── Week_5_Part_2.pdf └── dice_HW.ipynb ├── Week 6 Discrete and Continuous Distribution ├── 1 Distribution Families.rtf ├── 10 Exponential Distribution.rtf ├── 11 Normal Distribution.rtf ├── 12 Gaussian Probability.rtf ├── 2 Bernoulli.rtf ├── 3 Binomial Distribution.rtf ├── 4 Poisson.rtf ├── 5 Geometric.rtf ├── 6 Examples.rtf ├── 6_conditional_probability_hw.ipynb ├── 7 Continuous Distributions.rtf ├── 8 Functions of Random Variables.rtf ├── 9 Uniform Distribution.rtf ├── Problem Set 6 _ 6.12 Problem Sets _ DSE210x Courseware _ edX.html ├── Problem Set 6 _ 6.12 Problem Sets _ DSE210x Courseware _ edX.pdf ├── Programming Assignment _ 6.13 Programming Assignment _ DSE210x Courseware _ edX.html ├── Programming Assignment _ 6.13 Programming Assignment _ DSE210x Courseware _ edX.pdf ├── Quiz 6 _ 6.11 Comprehension Quiz _ DSE210x Courseware _ edX.html ├── Quiz 6 _ 6.11 Comprehension Quiz _ DSE210x Courseware _ edX.pdf ├── Week6_Continuous_Probability.ipynb ├── Week6_Discrete_Probability.ipynb ├── Week_6_Part_1.pdf ├── Week_6_Part_2.pdf ├── Week_6_Part_3.pdf └── Week_6_Part_4.pdf ├── Week 7 Inequalities and Limit Theorems ├── 1 Markov's Inequality.rtf ├── 10_inequalities.ipynb ├── 2 Chebyshev's Inequality.rtf ├── 3 Weak Law of Large Numbers.rtf ├── 4 Moment Generating.rtf ├── 5 Chernoff.rtf ├── 6 Theorem.rtf ├── 7 Proof.rtf ├── Expectation_variance_and_covariance.ipynb ├── Problem Set 7 _ 7.9 Problem Sets _ DSE210x Courseware _ edX.html ├── Problem Set 7 _ 7.9 Problem Sets _ DSE210x Courseware _ edX.pdf ├── Programming Assignment _ 7.10 Programming Assignment _ DSE210x Courseware _ edX.html ├── Programming Assignment _ 7.10 Programming Assignment _ DSE210x Courseware _ edX.pdf ├── Quiz 7 _ 7.8 Comprehension Quiz _ DSE210x Courseware _ edX.html ├── Quiz 7.pdf └── inequalities_HW.ipynb ├── Week 8 Statistics and Parameter Estimation ├── 1. Stats.rtf ├── 11_Statistics_lecture_notebook_.ipynb ├── 2. Mean and Variance.rtf ├── 4. Unbiased Estimation.rtf ├── 5. Standard Deviation.rtf ├── HW_8.ipynb ├── Problem Set 8.pdf ├── Programming Assignment 8.pdf ├── Quiz 8.pdf ├── Week_8_Part_1.pdf └── Week_8_Part_2.pdf └── Week 9 Regression and PCA ├── 1. Review of Linear Algebra.rtf ├── 2. Matrices Notations and Operations.rtf ├── 3. Solving a System of Linear Equations.rtf ├── 4. Linear Regression.rtf ├── 5. Polynomial Regression.rtf ├── 6. Regression Towards the Mean.rtf ├── 7. Components Analysis.rtf ├── HW_9.zip ├── Programming Assignment.pdf ├── Quiz 9.pdf ├── lectures.zip └── more_lectures.zip /LICENSE.md: -------------------------------------------------------------------------------- 1 | © 2012–2018 edX Inc. All rights reserved except where noted. 2 | EdX, Open edX and the edX and Open edX logos are registered trademarks or trademarks of edX Inc. 
| 粤ICP备17044299号-2 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DSE210x-Statistics-and-Probability-in-Data-Science-using-Python 2 | UCSanDiegoX edX Course DSE210x Statistics and Probability in Data Science using Python 3 | https://courses.edx.org/courses/course-v1:UCSanDiegoX+DSE210x+3T2017/course/ 4 | 5 | Welcome to Statistics and Probability in Data Science using Python! 6 | We are delighted to welcome you to Statistics and Probability in Data Science using Python. 7 | In this course, you will learn the motivation, intuition, and theory behind the probabilistic 8 | and statistical foundations of data science, and will get to experiment and practice with these concepts 9 | via Python programs and the Jupyter Notebook platform. 10 | 11 | Course Staff 12 | Instructors 13 | Alon Orlitsky, Professor, ECE and CSE Departments, UC San Diego 14 | Yoav Freund, Professor, CSE Department, UC San Diego 15 | 16 | Teaching Assistants 17 | Matthew Elliot, Graduate Student, CSE, UC San Diego 18 | Rohit Parasnis, Graduate Student, ECE, UC San Diego 19 | Hanwen Yao, Graduate Student, ECE, UC San Diego 20 | Zhen Zhai, Graduate Student, CSE, UC San Diego 21 | 22 | What do you need to know to succeed? 23 | The course is intended for learners with an undergraduate degree or senior undergraduates 24 | interested in broadening their understanding of probability and statistics. 25 | We will assume basic knowledge of the following topics 26 | 27 | Logic (e.g., De Morgan’s Laws) 28 | Set theory (e.g., what are functions) 29 | Calculus (e.g., calculating integrals and derivatives) 30 | Programming (e.g., basic experience with any programming language) 31 | Linear algebra (e.g., vectors and matrices) 32 | The Python programming language will be used throughout the course. 33 | If you would like to learn or gain more practice with Python, please consider 34 | viewing or taking the first course in this MicroMasters, Python for Data Science. 35 | 36 | Overview 37 | The course will cover the following topics: 38 | 39 | Counting and combinatorics 40 | Discrete and continuous probability 41 | Conditional probability and Bayes’ Rule 42 | Random variables 43 | Expectation, variance, and correlation 44 | Common distribution families 45 | Probabilistic inequalities and concentration 46 | Moments and limit theorems 47 | Hypothesis testing 48 | Sampling and confidence intervals 49 | PCA and regression 50 | Entropy and compression 51 | Learning Objectives 52 | The course will teach you how to visualize, understand, and reason about probabilistic 53 | and statistical concepts, and how to apply your knowledge to analyze data sets and draw 54 | meaningful conclusions from data. We will cover both theoretical and practical aspects, 55 | and will start each topic with motivation and intuition and will proceed with rigorous 56 | arguments and provable techniques. Each topic will be accompanied by a Python Notebook 57 | that you could run and modify to experiment with the material learned and get a better 58 | feel for the material covered. 59 | 60 | Course Outline 61 | The course consists of 10 units. In each of the course’s first 10 weeks we will release 62 | one unit, and you will have six weeks to complete it. 
63 | 64 | ● Week 1 - Introduction 65 | 66 | ● Week 2 - Sets 67 | 68 | ● Week 3 - Counting and Combinatorics 69 | 70 | ● Week 4 - Probability and Conditioning 71 | 72 | ● Week 5 - Random Variables, Expectation, and Variance 73 | 74 | ● Week 6 - Discrete and Continuous Distribution Families 75 | 76 | ● Week 7 - Inequalities and Concentration Theorems 77 | 78 | ● Week 8 - Sampling, Confidence Intervals, and Hypothesis Testing 79 | 80 | ● Week 9 - Regression and Principal Component Analysis 81 | 82 | ● Week 10 - Entropy and Compression 83 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.1 Introduction to Statistics and Probability.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - [Yoav] Hi, my name is Yoav Freund,\par 4 | and this is Introduction to Probability and Statistics.\par 5 | This is the first video,\par 6 | therefore it is the introduction\par 7 | to Introduction to Probability and Statistics.\par 8 | So we're going to talk about probability.\par 9 | That is, roughly speaking, about tossing dice.\par 10 | And we're going to talk about statistics,\par 11 | which is, roughly speaking,\par 12 | about keeping scores in baseball.\par 13 | So why should you care about probability and statistics?\par 14 | Basically, it's because this is a very powerful tool\par 15 | for dealing with uncertainty.\par 16 | So consider the example here,\par 17 | that we have Google trying to give us a good route\par 18 | from point A to point B.\par 19 | There are two routes that are shown here.\par 20 | One is the shortest route,\par 21 | and the other is the fastest route.\par 22 | So if we think about the shortest route from A to B,\par 23 | that is a certainty.\par 24 | Once we know how the roads are constructed,\par 25 | that is a certain thing.\par 26 | On the other hand,\par 27 | in terms of the fastest route from A to B,\par 28 | that depends on traffic, on other conditions,\par 29 | and that really is something\par 30 | that we have a lot of uncertainty about.\par 31 | Therefore, we need statistics\par 32 | to help us deal with that uncertainty.\par 33 | Here's another example.\par 34 | Search Engines.\par 35 | Suppose you have a search engine\par 36 | and you're looking for some information.\par 37 | The first kind of query that you can make,\par 38 | that is a certainty query,\par 39 | is to find all the web pages\par 40 | that contain the words Trump, or Hillary, and debate. Okay?\par 41 | So that is basically a very specific condition\par 42 | and you can ask your search engine\par 43 | to look for all of those pages.\par 44 | On the other hand, it might be more relevant\par 45 | to ask for the ten most relevant pages for the query\par 46 | "Trump and Hillary debate."
Okay?\par 47 | So that is not a query\par 48 | that has a specific set of well defined answers.\par 49 | It has to do with what words appear in those kind of things\par 50 | and with what pages are really most relevant,\par 51 | most up to date,\par 52 | various things with which we have uncertainty.\par 53 | The last example is about an insurance company.\par 54 | So with an insurance company,\par 55 | you have a contract,\par 56 | and it says with certainty,\par 57 | that if you have life insurance\par 58 | with this company and you die,\par 59 | then the insurance company has to pay your family\par 60 | some prescribed amount of dollars.\par 61 | So that's a certainty.\par 62 | On the other hand,\par 63 | the insurance company itself\par 64 | has to deal with a lot of uncertainty.\par 65 | It doesn't know which people are going to die.\par 66 | So it has to figure out,\par 67 | what is the minimum life insurance premium\par 68 | such that the probability\par 69 | that life insurance company\par 70 | will go bankrupt in 10 years\par 71 | is smaller than 1%.\par 72 | Probably much, much smaller than that.\par 73 | In any case, what the company needs to somehow deal with,\par 74 | is the uncertainty of how many people will die\par 75 | that have insurance,\par 76 | and how much they will have to pay them. Okay?\par 77 | So that's a case we need to deal with uncertainty.\par 78 | So what are you going to learn in this course?\par 79 | First of all,\par 80 | navigation and search engine that I showed you,\par 81 | those are very advanced problems,\par 82 | as is the life insurance market.\par 83 | What you will learn here are the foundations\par 84 | that these kind of methods are based on.\par 85 | So you'll solve basic problems\par 86 | of reasoning under uncertainty.\par 87 | So as an example,\par 88 | you will know how to answer a question of the type;\par 89 | If you flip a coin 100 times,\par 90 | what is the probability of getting at most 10 heads?\par 91 | Or, what is the probability\par 92 | of getting a four of kind hand in poker?\par 93 | These are questions that you would be able to answer.\par 94 | And if you're interested in computer science examples,\par 95 | here are some other questions you might be able to answer\par 96 | which is, suppose you have a hash table\par 97 | with a million elements,\par 98 | and you don't want to allow more than five indirections\par 99 | at most for 10 elements,\par 100 | so how big does the table need to be?\par 101 | That's a calculation that you will be able to do.\par 102 | A similar one is,\par 103 | suppose that you have a router\par 104 | and that the router fails from time to time.\par 105 | And the rates of failure is once a year.\par 106 | So that gives you some information,\par 107 | but you would want to know,\par 108 | specifically let's say,\par 109 | what's the probability that it will fail\par 110 | during the first month?\par 111 | Given that on average it fails once per year.\par 112 | So that's a question that you'll be able to answer.\par 113 | So it's not universal, the belief in statistics.\par 114 | Here is an example from a basketball coach\par 115 | who does not believe in statistics\par 116 | because there are too many other factors and so on.\par 117 | So there are many people\par 118 | that basically don't want to trust statistics.\par 119 | And that is fine.\par 120 | But on the other hand,\par 121 | you can say many other people\par 122 | dealing with the same domain, here, basketball,\par 123 | do trust 
statistics.\par 124 | So this is an app for fans of basketball\par 125 | to have statistics about different players\par 126 | and so they can do fantasy basketball\par 127 | and win a lot of money.\par 128 | So to summarize,\par 129 | what we have all around us\par 130 | when we are doing anything in the world,\par 131 | is uncertainty.\par 132 | And probability and statistics provide a rational way\par 133 | to deal with uncertainty.\par 134 | So what we're going to discuss next is,\par 135 | what is probability, and then what is statistics.\par 136 | So I'll see you then.\par 137 | } 138 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.2 What is Probability Theory.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hi, welcome to the course.\par 4 | In this course, there are two\par 5 | main subjects that we will study.\par 6 | One is probability and the other is statistics.\par 7 | These are very related subjects,\par 8 | but they are still different, so let's start\par 9 | by thinking about what is probability.\par 10 | So probability is the mathematical framework\par 11 | for computing probabilities of complex events.\par 12 | That's a mouthful, and we make an assumption\par 13 | that we know the probability of the basic event.\par 14 | What precisely do we mean by probability and by event?\par 15 | Those will be defined later in the class.\par 16 | For now, let's just think in terms of common sense.\par 17 | So let's think about a simple question.\par 18 | We flip a coin and we get tails.\par 19 | We flip it again, we get heads.\par 20 | Okay, what we believe somehow is that\par 21 | the probabilities are equal, but\par 22 | what do we really mean by that?\par 23 | What does that mean, does it mean that\par 24 | we'll exactly get the same number of heads and tails?\par 25 | No, it just means that ...\par 26 | if we flip the coin many times,\par 27 | then for some very large number of coins that we flip,\par 28 | let's say ten thousand, we'll get\par 29 | the number of heads is approximately or about five thousand.\par 30 | Okay, that's what we expect, but\par 31 | what do we mean by about, how can we,\par 32 | how can we express this notion of about\par 33 | in a better way because we might be\par 34 | actually interested in knowing how far from that we are.\par 35 | Okay, so we're going to simulate coin flips.\par 36 | We'll use a pseudo random number generator\par 37 | to simulate coin flips, and instead of heads and tails\par 38 | it'll be more convenient to use one and minus one,\par 39 | and then the number of heads relates to\par 40 | summing all of these plus ones and minus ones\par 41 | and what we expect is that the sum\par 42 | will be zero or close to zero.\par 43 | So we will vary the number of\par 44 | coins flips which we denote by k\par 45 | and here is a little bit of code\par 46 | for generating such random coin flips.\par 47 | Here we're generating the coin flips themselves\par 48 | and here we're summing the coin flips\par 49 | along a particular sequence.\par 50 | We're generating many sequences at once,\par 51 | and this is the number n that we\par 52 | say here is by default one hundred.\par 53 | So this is a central part of the code,\par 54 | but I'm 
not going to show you all of the code,\par 55 | for, to see that, you have to download\par 56 | the notebook yourself and play with it to see the details.\par 57 | So here is a histogram that shows for\par 58 | flipping a coin a thousand times\par 59 | what is the distribution of this sum, that we said.\par 60 | Sum is about zero, but then you see it's not exactly zero,\par 61 | and every time that I rerun this experiment,\par 62 | every time that I rerun the experiment,\par 63 | what you see is that the histogram\par 64 | that you get is slightly different.\par 65 | However, even though it is always\par 66 | each time slightly different, there is\par 67 | something very much in common for all of these coin flips.\par 68 | They're all concentrated around zero,\par 69 | but they're not exactly zero, and for this\par 70 | number of coin flips, one thousand,\par 71 | it is extremely unlikely that they're\par 72 | below two hundred and fifty or above\par 73 | minus two hundred and fifty and above two hundred and fifty.\par 74 | Okay, so with probability theory,\par 75 | we can calculate how small we expect Sk, the sum, to be.\par 76 | The absolute value of the sum, so it can be\par 77 | either negative or positive, and what we show,\par 78 | we will show, is that the probability\par 79 | that this Sk is larger than four times\par 80 | the square root of k is extremely small.\par 81 | It is two times ten to the minus eight,\par 82 | or 0.000002 percent,\par 83 | so we'll have to flip\par 84 | the sequence of one thousand coins\par 85 | many many many times before we can see\par 86 | that it will be bigger than four square root of k.\par 87 | So let's actually do the simulation\par 88 | and see if that is the case.\par 89 | Okay, so here is our simulation.\par 90 | What we see is, here, we have one hundred coin flips.\par 91 | Here, one thousand coin flips,\par 92 | and here ten thousand coin flips,\par 93 | and the red line mark what probability theory says\par 94 | is the boundary in which it is very, very likely\par 95 | that the total number of coin flips resides.\par 96 | So I can rerun this experiment too.\par 97 | And you see that again, each time\par 98 | the distribution is somewhat different\par 99 | but it never goes outside of the red bar.\par 100 | So that's consistent with what we said.\par 101 | Now, here it seems that all of them are very similar.\par 102 | It doesn't really matter if you do\par 103 | one hundred, one thousand, or ten thousand coin flips,\par 104 | but that's really because I'm scaling it\par 105 | according to this boundary, so the boundary\par 106 | here is minus 40 to 40, here it's\par 107 | minus one hundred and something to one hundred and something\par 108 | and here it's minus four hundred to four hundred.\par 109 | If we scale, if we plot the full scale\par 110 | of these coin flips, what we see is the following.\par 111 | We see something like this, so when\par 112 | we plot the whole scale from\par 113 | minus one hundred to one hundred, for hundred coins,\par 114 | and from minus ten thousand to ten thousand\par 115 | for ten thousand coins, then we see\par 116 | that the distribution becomes more and more concentrated\par 117 | around zero, relative to this scale.\par 118 | So if I run it again ...\par 119 | Again you get each time a different distribution\par 120 | but you get that the distribution\par 121 | is more and more concentrated if you\par 122 | flip the coin more and more times\par 123 | and the width of this is square root of k,\par 124 | 
four times two times square of k, so you see\par 125 | the more coin flips you, more times you flip the coin,\par 126 | the closer it is relatively to the range to zero.\par 127 | Okay, so let's summarize.\par 128 | We did some experiments where we summed k random numbers\par 129 | that are correspond to coin flips\par 130 | with probability xi, so we had xi minus one or plus one\par 131 | with probability is half and half.\par 132 | And our experiments show that the sum\par 133 | is almost always in the range\par 134 | minus four square root of k to plus four square root of k.\par 135 | Okay, so we can write it this way.\par 136 | If k goes to infinity, we have 4 square root of k\par 137 | as the range, divided by k, so that is\par 138 | four divided by the square root of k,\par 139 | which is equal to zero, which goes to zero as k increases.\par 140 | And so what we can say is that Sk, relative to k,\par 141 | so the ratio of the number, of the difference\par 142 | between heads and tails, divided by k, that goes to zero.\par 143 | And that's basically what we mean\par 144 | by the probability is being half and half.\par 145 | Okay so again, what is probability theory?\par 146 | It's the math involving proving in a precise way\par 147 | the statements that we made above.\par 148 | Okay, so before, we just kind of did simulations\par 149 | and alluded to something that will prove in the future\par 150 | but that's really what probability theory is,\par 151 | is proving these in a precise way.\par 152 | In most cases, we can approximate the output,\par 153 | these probabilities, using simulations.\par 154 | These are called Monte-Carlo simulations,\par 155 | and that's essentially what we did\par 156 | in this little experiment that we did.\par 157 | So why isn't that enough?\par 158 | Because, first of all, calculating\par 159 | the probability gives you a precise answer,\par 160 | and doing Monte-Carlo simulations\par 161 | just gives you an approximation\par 162 | and you need to run the experiment longer and longer\par 163 | to get more and more accurate answers.\par 164 | And the second is that it is much\par 165 | faster than Monte-Carlo simulations,\par 166 | essentially for the same reasons.\par 167 | Okay, so that is a quick description of what is probability\par 168 | and next time, we're going to talk about what is statistics.\par 169 | See you there.\par 170 | End of transcript. 
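As a companion to the coin-flip experiment described in this transcript, here is a minimal sketch (assuming NumPy; it is not the course's Week1-IntroMotivation notebook, which the lecture asks you to download for the full code) that sums k random ±1 flips and checks how often the absolute sum exceeds 4·sqrt(k):

```python
import numpy as np

def coin_flip_sums(k, n=1000, seed=0):
    """Return the sums S_k of n independent sequences of k fair +/-1 coin flips."""
    rng = np.random.default_rng(seed)
    flips = rng.choice([-1, 1], size=(n, k))   # n sequences, k flips each
    return flips.sum(axis=1)

for k in (100, 1000, 10000):
    sums = coin_flip_sums(k)
    bound = 4 * np.sqrt(k)
    outside = np.mean(np.abs(sums) > bound)    # fraction of sequences with |S_k| > 4*sqrt(k)
    print(f"k={k:6d}  4*sqrt(k)={bound:7.1f}  fraction outside: {outside:.4f}")
```

For a fair coin the printed fractions stay at or very near zero, matching the claim above that the sum almost always lies inside the ±4·sqrt(k) band, and that this band shrinks relative to k as k grows.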
Skip to the start.\par 171 | POLL\par 172 | } 173 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.3 What is Statistics.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Okay.\par 4 | So, in the previous video, we talked\par 5 | about what is probability,\par 6 | and this time we're going to talk\par 7 | about what is statistics?\par 8 | So, in probability theory,\par 9 | we compute probabilities of complex events,\par 10 | from the underlying base distribution.\par 11 | Statistics takes us in the opposite direction.\par 12 | We're given data that was generated\par 13 | by some Stochastic process, or some random process,\par 14 | and from that data we infer properties\par 15 | of this Stochastic process.\par 16 | So here is an example.\par 17 | Again, let's go back to our coin.\par 18 | Here's our coin.\par 19 | And, we believe that it is an unbiased coin.\par 20 | So, it gives us exactly half heads and half tails.\par 21 | But how can we be sure?\par 22 | So, previous time we saw that,\par 23 | we talked about the distribution,\par 24 | and now we wanna turn the question around.\par 25 | If we flip the coin 1000 times,\par 26 | and we get 570 heads,\par 27 | then can we conclude that the coin is biased?\par 28 | So, the coin is not a fair coin?\par 29 | What about, can we conclude that the coin is biased?\par 30 | Not a fair coin?\par 31 | What can we conclude if the outcome is 507 heads?\par 32 | It's still not exactly 500, but closer to it.\par 33 | So how can we decide whether the coin\par 34 | is biased or not?\par 35 | So, here is the logic of statistical inference.\par 36 | We wanna say something about the coin,\par 37 | and we use the following kind of logic.\par 38 | Let's say that the coin is fair, okay?\par 39 | And then we can calculate\par 40 | what is the probability that the coin\par 41 | will give us 570 coins.\par 42 | If this probability is extremely small,\par 43 | then we can reject with confidence\par 44 | the hypothesis that the coin is fair.\par 45 | We can say it is very unlikely\par 46 | that a fair coin would generate this sequence,\par 47 | and therefore it is not a fair coin.\par 48 | So, let's see how we can calculate the answer.\par 49 | Recall the simulations we did,\par 50 | with the video, What is Probability.\par 51 | We used xi minus one, xi plus one,\par 52 | instead of heads and tails,\par 53 | and we looked at the sum of all of these variables.\par 54 | And, if the number of heads is 570,\par 55 | then we can easily see that the sum\par 56 | over 1000 plus ones and minus ones,\par 57 | will give us 140.\par 58 | Now, we know, we haven't really shown it yet,\par 59 | but we already, I already stated it\par 60 | in the previous slide,\par 61 | that it's very unlikely that this sum,\par 62 | the absolute value of it,\par 63 | is larger than four square root of k,\par 64 | which in this case is 126.5.\par 65 | So, this is how I calculated this 126.5.\par 66 | And, therefore it is very unlikely\par 67 | that the coin is unbiased, right?\par 68 | We can say the coin is probably biased.\par 69 | Because, it is very unlikely that an unbiased coin\par 70 | would generate this outcome.\par 71 | What about 507 heads?\par 72 | Well, if you have 507 
heads, 493 tails,\par 73 | then Sn is going to be 14,\par 74 | and 14 is much, much smaller than 126.5.\par 75 | So at least according to the rule we just had,\par 76 | we cannot say that the probability is very small,\par 77 | and as we'll see, the probability\par 78 | is actually quite reasonable,\par 79 | that it's quite large, that a fair coin\par 80 | would generate 507 heads.\par 81 | So, we cannot conclude that the coin is biased.\par 82 | It might still be biased, but we might have to\par 83 | flip the coin many, many, many more times,\par 84 | before we can deduce that.\par 85 | Okay, so, as a conclusion,\par 86 | the probability that unbiased coin\par 87 | would generate a sequence\par 88 | with 570 or more heads is extremely small,\par 89 | and from that we can conclude, with high confidence,\par 90 | that the coin is biased.\par 91 | On the other hand, if this sum is larger than 507,\par 92 | that is quite likely.\par 93 | And so getting 507 heads does not provide us\par 94 | with evidence that the coin is biased.\par 95 | Let's think about some real world examples.\par 96 | So statistics, unlike probability\par 97 | which is a part of math,\par 98 | statistics is really about problems in the real world.\par 99 | And so you might ask,\par 100 | "Why should I care about the coin being biased?\par 101 | That's not a problem I kind of face many times."\par 102 | And that's a very valid critique.\par 103 | And we will now give some examples\par 104 | for real problems where the statistics\par 105 | is very closely related to whether or not a coin is biased.\par 106 | So, let's take polls, for instance.\par 107 | Suppose there is elections,\par 108 | that are going to take place in a few days,\par 109 | and we want to know how people plan to vote.\par 110 | Suppose there are just two parties: D and R.\par 111 | We could try and ask all of the potential voters\par 112 | what they plan to vote,\par 113 | but that would be extremely expensive.\par 114 | We cannot really afford to do that.\par 115 | So instead, we can use a poll.\par 116 | What is a poll?\par 117 | We call a small randomly selected set of people,\par 118 | ask them of their opinions,\par 119 | and then we extrapolate from that\par 120 | what do people think in general.\par 121 | So, call n people at random,\par 122 | and count the number of D votes.\par 123 | And the question is, can you say with confidence\par 124 | that there are more D votes, or more R votes?\par 125 | Mathematically, this is exactly equivalent\par 126 | to flipping a biased coin, and asking\par 127 | whether heads is more likely than tails.\par 128 | Or tails more likely than heads.\par 129 | It's the exact same question,\par 130 | and the same math holds for it.\par 131 | Here's another case.\par 132 | This is called A/B testing,\par 133 | which is a very common practice now,\par 134 | on developing web interfaces.\par 135 | You basically think about two alternative designs\par 136 | for your web page, one is A, one is B.\par 137 | And the only difference in this case\par 138 | is whether this bar is to the left,\par 139 | or to the right of the screen, okay?\par 140 | So, to see which design users prefer,\par 141 | we randomly present the design A or design B,\par 142 | when people visit our website.\par 143 | And we measure how long the user stays on the page,\par 144 | or whether the user clicked on an advertisement,\par 145 | or any other indication that the user\par 146 | likes one of the designs more than the other.\par 147 | We want to 
decide with some confidence,\par 148 | which of the two designs is better.\par 149 | Again, this is very similar\par 150 | to making the decision with confidence\par 151 | on whether head is more probable than tails,\par 152 | or vice versa.\par 153 | Okay, so to summarize, statistics is about\par 154 | taking data from some real-world process,\par 155 | and drawing conclusions about this process\par 156 | from the data you collected.\par 157 | And, we talked about several examples.\par 158 | One is using polls to estimate public opinion.\par 159 | Another is performing A/B tests to design web pages.\par 160 | And, of course there are many, many others\par 161 | that are all around us.\par 162 | One is estimating the rate of global warming.\par 163 | Another is deciding\par 164 | whether a medical procedure is effective.\par 165 | So these are slightly more complicated,\par 166 | and they don't map directly\par 167 | to flipping a biased coin,\par 168 | but they are of a similar type.\par 169 | So, this ends the description of probability and statistics.\par 170 | And we are now going to start diving into\par 171 | the details of how to do probability.\par 172 | Thank you.\par 173 | End of transcript. Skip to the start.\par 174 | POLL\par 175 | } 176 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.4 A Puzzle.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - In a previous video, I give you a short explanation\par 4 | for what is probability.\par 5 | And you might wonder,\par 6 | "Why do I really need that, all this math?\par 7 | "Does it really help me in real world situations?"\par 8 | So I'm going to give you here a little puzzle,\par 9 | it's a very natural puzzle, it's not something contrived,\par 10 | and I'm going to ask you to think about\par 11 | how you would want to answer this puzzle, okay?\par 12 | So it's called the three card puzzle.\par 13 | And what I want you to imagine\par 14 | is that you have three cards in a hat, okay?\par 15 | So the cards are, one card is red on one side,\par 16 | and blue on the other side, one card is blue on both sides,\par 17 | and one card is red on both sides.\par 18 | So I have those cards here.\par 19 | Let me show you.\par 20 | Here is the card that has red on one side,\par 21 | blue on the other side.\par 22 | This is the card that has red on both sides.\par 23 | And this is the card that has blue on both sides, okay?\par 24 | Simple enough.\par 25 | So we have these cards in the hat, and we mix them,\par 26 | we mix the cards, and do the following.\par 27 | We pick a card at random, okay?\par 28 | And we put it on the table, all right?\par 29 | So here we have a blue card on the table.\par 30 | And the color of the side\par 31 | that is facing up is, I call it U.\par 32 | It's B or R, okay?\par 33 | So here's the bet that I'm suggesting we make.\par 34 | If the other side of the card has a different color,\par 35 | I will pay you $1.\par 36 | If the other side has the same color, you pay me $1, okay?\par 37 | And I think that this is fair.\par 38 | Why do I think it's fair?\par 39 | Because, suppose it's red, actually it's blue.\par 40 | So what can this card be?\par 41 | It can be either the blue on both sides\par 42 | or the 
blue and red, okay?\par 43 | So there are two options, one is with the other side\par 44 | being red and one is with the other side being blue,\par 45 | so they both have the same probability of happening,\par 46 | so having a bet of one to one is a fair one, okay?\par 47 | So let's see this one.\par 48 | Yeah, actually I got red, so I pay you $1.\par 49 | So to decide what is the probability of winning a dollar\par 50 | or losing a dollar, let's use a Monte Carlo simulation\par 51 | like we did earlier, okay?\par 52 | So we're going to have a little program that generates\par 53 | at random an order for the cards, chooses one of the cards,\par 54 | and then chooses a side for the card, and then prints out\par 55 | the card and the outcome of whether which side wins, okay?\par 56 | So here is our, the outcome of our Monte Carlo simulation,\par 57 | you can actually rerun it.\par 58 | So each time I'm getting slightly different outcomes.\par 59 | But if you look at the numbers that come out down here,\par 60 | it shows that different is 17 times, and same is 33 times.\par 61 | So clearly, different happens\par 62 | much fewer times than same, okay?\par 63 | So I basically have here a game that is,\par 64 | even as simple as it is, it's going to, it's unfair to you\par 65 | and I'm going to gain money on average from playing it.\par 66 | Okay, so as we saw, the simulation does not agree\par 67 | with our argument we made before.\par 68 | The argument must be false.\par 69 | In the simulation, the two sides have the same color\par 70 | about twice the number of times\par 71 | that they have different colors.\par 72 | So they have about twice the probability.\par 73 | So what does that mean?\par 74 | It means that you, if you play this game, you are twice\par 75 | as likely to lose a dollar as you are to win.\par 76 | And because of that, on average, you are going to lose\par 77 | 33 cents per iteration because you lose $1\par 78 | with probability 2/3, and you gain a dollar\par 79 | with probability 1/3.\par 80 | So you're losing 33 cents on each iteration.\par 81 | So here's an alternative argument.\par 82 | Okay, so if we pick card at random, then 2/3 of the time\par 83 | we pick a card where the two sides have the same color,\par 84 | and only 1/3 where the color is different.\par 85 | So that basically explains what we saw.\par 86 | So supposedly now we understand the game, but the problem is\par 87 | the original argument also sounds convincing,\par 88 | but it is wrong.\par 89 | So how can we distinguish between this argument\par 90 | and another argument and say which one is right\par 91 | without doing a simulation?\par 92 | Okay, simulation is fine, but sometimes, as you saw already,\par 93 | running a simulation to answer a probability question\par 94 | is hard, you have to run for a long, long time,\par 95 | and you get only an approximate result, okay?\par 96 | So, to be sure that we need,\par 97 | the argument is correct,\par 98 | we need more formalism.\par 99 | That's the point of this video.\par 100 | We need concepts like outcome and event,\par 101 | and those are things that we're going\par 102 | to start talking about next week.\par 103 | End of transcript. 
Skip to the start.\par 104 | POLL\par 105 | } 106 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.5 A poster.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hi, last video\par 4 | we talked a little bit about the history\par 5 | of probability and statistics.\par 6 | And I gave you some pointers\par 7 | to the main ideas that were there.\par 8 | Now, I would like to put these things together\par 9 | with the timeline so that you can\par 10 | have a better sense about how probability\par 11 | and statistics developed over the years.\par 12 | So let's look at this poster.\par 13 | This is a short history of probability and statistics\par 14 | and much of it is based on this book\par 15 | by Ian Hacking, The Emergence of Probability.\par 16 | So if you're interested in more\par 17 | you can go read this book.\par 18 | It's an excellent book.\par 19 | As we said in the last video,\par 20 | there are two parts to\par 21 | probability and statistics.\par 22 | The two threads are\par 23 | repeated games of chance, on the one hand,\par 24 | and the strength of evidence\par 25 | and degrees of belief on the other.\par 26 | That's the two arrows, the red one going down\par 27 | and the green one going down,\par 28 | and time goes in this direction,\par 29 | so this is as time progresses.\par 30 | And statistics and probability\par 31 | in its modern form is pretty much agreed to be\par 32 | starting at the time of\par 33 | Pascal and Fermat\par 34 | that were two mathematicians in 1654.\par 35 | In these correspondences, the main ideas\par 36 | were laid out, of course, there were\par 37 | other people involved, but the main\par 38 | interesting thing is around 1650\par 39 | is when modern mathematical probability and statistics\par 40 | started to be developed.\par 41 | Okay, so that's the timeline that we have\par 42 | with the blue line.\par 43 | All right, so let's look a little bit\par 44 | from what happened before that point.\par 45 | Before that point, you had repeated games of chance,\par 46 | so those were games played with different things,\par 47 | like knucklebones\par 48 | and dice and cards.\par 49 | And those raised questions of the type\par 50 | of what is the right way to split\par 51 | the money when you stop a game early?\par 52 | So that is the part of statistics that we will\par 53 | actually deal with quite a lot in the beginning,\par 54 | which has to do with games of chance.\par 55 | The other part that is much less well-defined,\par 56 | but probably even more important is what do we do\par 57 | when we have a state of uncertainty?\par 58 | We have some evidence towards some conclusions\par 59 | but we are not sure how to weigh different\par 60 | evidences that might be contradictory.\par 61 | So these kinds of things come up in law.\par 62 | So here is the law.\par 63 | It comes up in medicine\par 64 | and it comes up in science\par 65 | and later on, technology.\par 66 | Basically, in modern science and technology,\par 67 | probability and statistics are\par 68 | a necessary part.\par 69 | Now, in public policy, it's also a necessary part.\par 70 | So those things existed from before\par 71 | and in these correspondences,\par 72 | Pascal and Fermat also 
related to them.\par 73 | But it's important to remember that these two things\par 74 | are quite different from each other.\par 75 | One is about evidence and about how people think\par 76 | about evidence and the other is much more mechanical.\par 77 | So it has to do with rolling dice and so on.\par 78 | So of course, the rolling of dice\par 79 | did not stop at that point.\par 80 | We have casinos, also, now.\par 81 | And so these questions are natural,\par 82 | and these questions give rise to the frequentist approach\par 83 | to probability and statistics that was\par 84 | described in the other video.\par 85 | And the best known champion of Frequentist Statistics\par 86 | is Andre Kolmogorov, one of the great\par 87 | mathematicians from Russia,\par 88 | and he invented what's called the axiom of probability.\par 89 | So he was central to this view.\par 90 | And in the more recent, current,\par 91 | still alive is Vladimir Vapnik,\par 92 | who has developed some of the foundations\par 93 | for machine learning.\par 94 | Okay, so this is about the frequentist.\par 95 | Now, in the other direction,\par 96 | in the side of evidence and degrees of belief,\par 97 | there was a different line of development,\par 98 | which is called Bayesian Statistics,\par 99 | and we will talk also about that in a later time,\par 100 | in which you take your belief before you see\par 101 | the evidence and you update them when you see the evidence\par 102 | and the champion of that was Bruno De-finetti.\par 103 | Okay, so you have\par 104 | on the one side this Bayesian Statistics approach\par 105 | and on the other side, the Frequentist approach\par 106 | and there's definitely a tension between the two.\par 107 | So this is a pretty famous picture by now\par 108 | of Vapnik standing around next to a board\par 109 | and in the board it says, "All your Bayes\par 110 | "are belong to us."\par 111 | So this is a clear slight of Bayesian Statistics.\par 112 | All right, but then what develops over time\par 113 | is people that are statistics practitioners,\par 114 | people that actually use probability and statistics\par 115 | in order to solve real world problems\par 116 | and I draw them in the middle here,\par 117 | the practitioners, because they are,\par 118 | in general, not dogmatic to one side or the other.\par 119 | They would use Bayesian Statistics when it's appropriate,\par 120 | Frequentist Statistics when it's appropriate\par 121 | and other heuristics when that's appropriate.\par 122 | Okay, so the father of those methods\par 123 | is Ronald Fisher, who has brought statistics\par 124 | to the sciences and also to the social sciences.\par 125 | And then more recent ones are\par 126 | John Tukey and even more recent\par 127 | is Leo Breiman, the inventor\par 128 | of bagging and cart and other important algorithms.\par 129 | So, just to add a little bit,\par 130 | there's this area that we will also talk about\par 131 | called hypothesis testing and P-values,\par 132 | which is the Frequentist approach to arguing\par 133 | about degrees of belief.\par 134 | So it's an interesting contrast\par 135 | and this actual approach is now very, very\par 136 | commonly used in science to accept or reject papers\par 137 | according to the strength of the evidence\par 138 | that they have.\par 139 | And as I said before, these middle ones\par 140 | are the practitioners, I drew different colored\par 141 | arrows in all kinds of ways\par 142 | because they're each unique.\par 143 | They take various ideas 
from probability and statistics\par 144 | and they apply it to real problems\par 145 | in their own unique way.\par 146 | So the modern version of these practitioners\par 147 | today is Machine Learning\par 148 | and even more recently, Big Data,\par 149 | when we try to apply Machine Learning\par 150 | methods to Big Data.\par 151 | So let's zoom out and to see the complete picture\par 152 | and what was important for me to show you here\par 153 | is that while the methods\par 154 | that people are more familiar today with\par 155 | are like Machine Learning, Big Data,\par 156 | and Neural Networks\par 157 | are very popular,\par 158 | there is actually a very long history\par 159 | and in this long history people developed\par 160 | a lot of very important methods\par 161 | that are worthwhile knowing about.\par 162 | So I hope that that gives you a perspective\par 163 | that will be useful for you for the rest of the course.\par 164 | Thank you very much.\par 165 | End of transcript. Skip to the start.\par 166 | Click to download the history Poster\par 167 | } 168 | -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.5 asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Histor.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/1.5 asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Histor.pdf -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.7 Week1-IntroMotivation.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/1.7 Week1-IntroMotivation.zip -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/1.9 Programming Assignment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/1.9 Programming Assignment.pdf -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Install Software.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Install Software.docx -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Install Software.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Install Software.pdf 
-------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Quiz 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Quiz 1.pdf -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/AB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/AB.png -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/Hitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/Hitting.png -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/Scorecard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/Week1-IntroMotivation/images/Scorecard.jpg -------------------------------------------------------------------------------- /Week 1 Introduction to Statistics and Probability/asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Week_1_sid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 1 Introduction to Statistics and Probability/asset-v1_UCSanDiegoX+DSE210x+3T2017+type@asset+block@Week_1_sid.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/2.1. 
Sigma Unknown Example.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - [Narrator] So, let's say that you wanted\par 4 | to find the average length\par 5 | of an elephant's trunk,\par 6 | in this case mature African elephant trunks.\par 7 | So if you want to do that, then what you would do,\par 8 | you would go out to the field and take some measurements\par 9 | of some trunks and, let's say that you took\par 10 | eight measurements, you found eight elephants,\par 11 | you took eight measurements.\par 12 | Let's say they are 5.62, 6.07 and so on,\par 13 | up to 5.48 feet.\par 14 | Okay?\par 15 | Now what you want to do is you want to find\par 16 | the confidence interval for the length of the trunk\par 17 | and let's say specifically you want to find the\par 18 | 95% confidence interval.\par 19 | So you want to find the mean such that if you\par 20 | did this experiment many times, 95% of the time\par 21 | you'll come up with an interval that includes the mean.\par 22 | So, first what you do is you find the critical t,\par 23 | and remember the formula is the critical value t,\par 24 | p with n-1 degrees of freedom is, we look at the inverse\par 25 | of the F, or the percentile point function of the\par 26 | t distribution with n-1 degrees of freedom\par 27 | evaluated at (1+p)/2, where p is 95%,\par 28 | so that's F7 because we have eight measurements,\par 29 | so we have seven degrees of freedom,\par 30 | inverse evaluated at 0.975, and when you do that, that gives you\par 31 | 2.3646, and to calculate this you use\par 32 | the t-distribution's ppf, the inverse of the cdf,\par 33 | evaluated at 0.975 with seven degrees of freedom,\par 34 | which gives you this value, so this is 2.3646.\par 35 | Then you calculate the sample mean,\par 36 | which is just the average of these values,\par 37 | so the sample mean in this case is 6.095.\par 38 | Then you calculate the sample variance,\par 39 | which as you remember you take each of those values,\par 40 | subtract the sample mean, square it,\par 41 | add them up and normalize by n-1,\par 42 | by seven, not by eight,\par 43 | because we're using the S correction.\par 44 | When you do that you get that S squared is 0.1705\par 45 | and therefore S, the sample standard deviation,\par 46 | is going to be 0.4130.\par 47 | Then, you calculate the margin of error,\par 48 | which is simply the critical t,\par 49 | multiplied by our estimate of the standard deviation,\par 50 | normalized by the square root of n,\par 51 | which is the number of samples that we take.\par 52 | When you do that you get, you get here,\par 53 | t, which we found to be 2.36,\par 54 | times S, which is this number,\par 55 | divided by square root of eight, is 0.3453.\par 56 | And then finally your confidence interval is just the\par 57 | sample mean plus/minus the margin of error,\par 58 | so plus/minus this value around the sample mean 6.095,\par 59 | so that gives you the interval from\par 60 | 5.7497 up to 6.4403.\par 61 | Okay, so that's all we need to do.\par 62 | And just want to end with\par 63 | just a few comments and observations.\par 64 | So first, when n is large, as we saw both using graphs\par 65 | and using analytical calculation,\par 66 | the t-distribution\par 67 | converges to the standard normal
distribution,\par 68 | and therefore the critical t value,\par 69 | with n-1 degrees of freedom, as n increases is going to\par 70 | converge to the critical z value,\par 71 | evaluated set point p.\par 72 | Also note that our estimate S of the standard deviation,\par 73 | the sample standard deviation, will converge to sigma.\par 74 | When you combine these two together you see\par 75 | that you can just use z-based techniques,\par 76 | and just use S instead of sigma when you don't know sigma,\par 77 | so just use the estimate, and these two techniques will,\par 78 | in the limit, as the normal sample increases,\par 79 | will give you the same value.\par 80 | Also note that when n is small, as is often the case,\par 81 | and this was the case that Gosset was concerned with,\par 82 | then the t-distribution is more accurate,\par 83 | and notice that it will yield a larger margin of error\par 84 | than when sigma is known,\par 85 | that's because there is more uncertainty.\par 86 | And last thing I want to say is that we assume that\par 87 | Xi's are normal, if you go back in the slides,\par 88 | but in reality, of course, the samples are not necessarily\par 89 | going to be normal, so that calculations that we did\par 90 | are valued mostly when the samples\par 91 | that we take are roughly normally distributed.\par 92 | So, let's summarize.\par 93 | We looked at confidence intervals when sigma is not known,\par 94 | we said we need to estimate sigma, which is natural,\par 95 | but we also need to replace the standard normal distribution\par 96 | by the student's t-distribution, which is derived\par 97 | by William Gosset as he was working from Guinness,\par 98 | trying to improve the production of beer, as we see here,\par 99 | and we gave step-by-step instructions\par 100 | for calculating the confidence interval, and an example.\par 101 | So next time we're going to talk about hypothesis testing,\par 102 | see you then.\par 103 | End of transcript. Skip to the start.\par 104 | POLL\par 105 | } 106 | -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/HW_10.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | " **IMPORTANT: ** When submitting this homework notebook, please modify only the cells that start with:<\/font>\n", 8 | "\n", 9 | "```python\n", 10 | "# modify this cell\n", 11 | "```" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 18, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from scipy import stats\n", 21 | "from math import sqrt" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Confidence Interval" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "For a sample with size large enough, by central limit theorem we can assume its mean follows normal distribution. And if we also know the standard deviation of the population, we are able to calculate a confidence interval to estimate the population mean." 
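The sigma-unknown procedure in the lecture transcript above maps almost line for line onto scipy. The sketch below follows the same steps (critical t from the percent point function of the t distribution with n-1 degrees of freedom, then the sample mean, the sample standard deviation with the n-1 correction, and the margin of error). The eight trunk lengths are hypothetical placeholders, since the transcript only quotes a few of them, so the printed interval will not reproduce the lecture's 5.7497 to 6.4403 exactly.

```python
# Sketch of the t-based 95% confidence interval from the sigma-unknown lecture.
# The data values are hypothetical stand-ins for the eight measurements.
import numpy as np
from scipy import stats

x = np.array([5.62, 6.07, 6.64, 5.91, 6.30, 6.55, 6.19, 5.48])  # placeholder sample
n = len(x)
p = 0.95

t_crit = stats.t.ppf((1 + p) / 2, df=n - 1)  # critical t with n-1 = 7 degrees of freedom
m = x.mean()                                 # sample mean
s = x.std(ddof=1)                            # sample standard deviation (divide by n-1)
r = t_crit * s / np.sqrt(n)                  # margin of error

print((m - r, m + r))                        # the 95% confidence interval
```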
36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Problem 1" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "A confidence interval is usually given by sample mean $m$ plus and minus a margin of error $r$:\n", 50 | "$$[m-r,m+r]$$\n", 51 | "The confidence interval is larger (less precise) for large confidence level, and smaller (more precise) for small confidence level.\n", 52 | "\n", 53 | "For this problem, you are asked to write a function **Error** that calculates the margin of error $r$ given sample size $n$, confidence level $p$ and standard deviation of the population $s$." 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | " **Code:**<\/font>\n", 61 | "```python\n", 62 | "Error(40,0.95,20)\n", 63 | "\n", 64 | "Error(40,0.95,10) \n", 65 | "```\n", 66 | "\n", 67 | "\n", 68 | " **Output**<\/font>\n", 69 | "```\n", 70 | "6.1979503230456148\n", 71 | "3.0989751615228074\n", 72 | "```" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 19, 78 | "metadata": { 79 | "collapsed": true, 80 | "nbgrader": { 81 | "grade": false, 82 | "locked": false, 83 | "solution": false 84 | } 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "# modify this cell\n", 89 | "\n", 90 | "def Error(n,p,s):\n", 91 | " # inputs: sample size n, confidence level p and standard deviation s\n", 92 | " # output: margin of error r\n", 93 | " \n", 94 | " #\n", 95 | " # YOUR CODE HERE\n", 96 | " #\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 47, 102 | "metadata": { 103 | "nbgrader": { 104 | "grade": true, 105 | "grade_id": "ex1", 106 | "locked": true, 107 | "points": "5", 108 | "solution": false 109 | } 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "# Check Function\n", 114 | "assert abs(Error(60,0.9,20)-4.2469938027546128) < 10**-5 \n", 115 | "assert abs(Error(60,0.9,10)-2.1234969013773064) < 10**-5\n", 116 | "\n", 117 | "#\n", 118 | "# AUTOGRADER TEST - DO NOT REMOVE\n", 119 | "#\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "# Problem 2" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "For this problem, you are asked to write a function **Confidence** that calculates the confidence level $p$ given sample size $n$, margin of error $r$ and standard deviation of the population $s$." 
134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | " **Code:**<\/font>\n", 141 | "```python\n", 142 | "Confidence(40,6,20)\n", 143 | "\n", 144 | "Confidence(40,8,20) \n", 145 | "```\n", 146 | "\n", 147 | "\n", 148 | " **Output**<\/font>\n", 149 | "```\n", 150 | "0.94222042887640267\n", 151 | "0.98858796361399826\n", 152 | "```" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 48, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "# modify this cell\n", 164 | "\n", 165 | "def Confidence(n,r,s):\n", 166 | " # inputs: sample size n, margin of error r, and standard deviation s\n", 167 | " # output: confidnce level r\n", 168 | " \n", 169 | " #\n", 170 | " # YOUR CODE HERE\n", 171 | " #\n" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 49, 177 | "metadata": { 178 | "collapsed": true, 179 | "nbgrader": { 180 | "grade": true, 181 | "grade_id": "ex2", 182 | "locked": true, 183 | "points": "5", 184 | "solution": false 185 | } 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# Check Function\n", 190 | "assert abs(Confidence(60,5,20)-0.94719248858388649) < 10**-5 \n", 191 | "assert abs(Confidence(60,6,20)-0.97986324844965367) < 10**-5\n", 192 | "\n", 193 | "#\n", 194 | "# AUTOGRADER TEST - DO NOT REMOVE\n", 195 | "#\n" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 2", 202 | "language": "python", 203 | "name": "python2" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 2 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text\/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython2", 215 | "version": "2.7.12" 216 | }, 217 | "toc": { 218 | "colors": { 219 | "hover_highlight": "#DAA520", 220 | "navigate_num": "#000000", 221 | "navigate_text": "#333333", 222 | "running_highlight": "#FF0000", 223 | "selected_highlight": "#FFD700", 224 | "sidebar_border": "#EEEEEE", 225 | "wrapper_background": "#FFFFFF" 226 | }, 227 | "moveMenuLeft": true, 228 | "nav_menu": { 229 | "height": "12px", 230 | "width": "252px" 231 | }, 232 | "navigate_menu": true, 233 | "number_sections": true, 234 | "sideBar": true, 235 | "threshold": 4, 236 | "toc_cell": false, 237 | "toc_section_display": "block", 238 | "toc_window_display": false, 239 | "widenNotebook": false 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 2 244 | } -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Problem Set 10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Problem Set 10.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Programming Assignment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Programming Assignment.pdf 
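The two graded stubs in HW_10.ipynb above are intentionally left blank. Assuming the z-based formulation described in the notebook's introduction (population standard deviation known, normally distributed sample mean), one way they could be filled in is sketched below; this is an illustrative solution, not the course's reference implementation.

```python
# One possible way to fill in the HW_10 stubs, assuming the z-based
# confidence interval described in the notebook (not an official solution).
from math import sqrt
from scipy import stats

def Error(n, p, s):
    # margin of error r for confidence level p: r = z_{(1+p)/2} * s / sqrt(n)
    z = stats.norm.ppf((1 + p) / 2)
    return z * s / sqrt(n)

def Confidence(n, r, s):
    # invert the relation above: p = 2 * Phi(r * sqrt(n) / s) - 1
    z = r * sqrt(n) / s
    return 2 * stats.norm.cdf(z) - 1

print(Error(40, 0.95, 20))    # ~6.198, matching the notebook's sample output
print(Confidence(40, 6, 20))  # ~0.9422, matching the notebook's sample output
```

With this formulation the check-cell values also agree, for example Error(60, 0.9, 20) comes out to about 4.2470.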
-------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_1.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_2.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_3.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_4.pdf -------------------------------------------------------------------------------- /Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 10 Confidence Intervals and Hypothesis Testing/Week_10_Part_5.pdf -------------------------------------------------------------------------------- /Week 2 Sets/1 Week_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 2 Sets/1 Week_2.pdf -------------------------------------------------------------------------------- /Week 2 Sets/2.3 Venn Diagrams Visualizing Sets.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hello and welcome back.\par 4 | So now that we have defined and created sets,\par 5 | the next thing we're going to do is visualize them,\par 6 | and advertise them and as we all know this is\par 7 | best done using Instagram.\par 8 | Except that sets were defined the way we're\par 9 | introducing them now a couple hundred years ago,\par 10 | and at that time the hottest thing was not Instagram,\par 11 | but rather a diagram, and specifically a Venn diagram.\par 12 | And Venn diagrams are named after John 
Venn.\par 13 | Who besides being a mathematician and a priest,\par 14 | was also an avid cricket player and he even designed\par 15 | machines that collect cricket balls.\par 16 | And so it perhaps not so surprising that when\par 17 | he wanted to do visualized sets,\par 18 | he visualized them as regions that often use\par 19 | look round like a ball, like this ball.\par 20 | And he visualized sets as elements\par 21 | as points inside those regions.\par 22 | So for example if you consider the sets here one,\par 23 | then you would have all the integers maybe,\par 24 | minus one, zero, one, two, and we put this circle\par 25 | around zero and one, meaning zero and one intersect\par 26 | and minus one and two are not intersect.\par 27 | Or if the elements are whatever they could be,\par 28 | we can put the circle around the elements that\par 29 | belong to the set like here, so these are elements\par 30 | in the set and the elements outside are not in a set.\par 31 | Or if we don't even want to even draw the elements,\par 32 | we can just draw the circle and we know that whatever is\par 33 | inside will be intersect, and whatever is outside\par 34 | this region will not intersect.\par 35 | And a single set gets a little boring, so one thing led\par 36 | to another and he looked at two sets.\par 37 | So here are a collection of points, and if you have\par 38 | a set A so these will be the elements in a set A,\par 39 | these are all the points in A.\par 40 | And here we have another set, B, so all these four\par 41 | points here will be four points in B.\par 42 | And then if you look in this region here where the two sets\par 43 | overlap, those are the points that are both A and B,\par 44 | and here are the points that are neither A nor B.\par 45 | And of course things get much more interesting still if you\par 46 | have three sets, and so you have three sets, A, B, and C.\par 47 | And these are the points that are in A,\par 48 | but not in B, nor in C.\par 49 | And these are the points that are in A and B, but not in C.\par 50 | And these are the points that are A, B, and C.\par 51 | And outside are the points here are not in any set.\par 52 | So why are we interested in Venn diagrams?\par 53 | Because everyone knows that a\par 54 | picture is worth a thousand words.\par 55 | And when it comes to sets and probability,\par 56 | Venn diagram is going to allow us to replace\par 57 | very long proofs by visual and definitions by visual\par 58 | definitions and visual proofs.\par 59 | So in a sense, this picture is\par 60 | going to be worth a whole dictionary.\par 61 | And if you don't trust me, then maybe you'll\par 62 | believe Jon Stewart who was the host of The Daily Show,\par 63 | and together with his show mates, they wrote a book\par 64 | called Earth, which is supposed to introduce visitors\par 65 | to our planet, to Earth.\par 66 | And in this book, they have a section\par 67 | called Gods of Science.\par 68 | And John Venn gets a honorable mention.\par 69 | And he gets it for two things.\par 70 | First, for getting something right.\par 71 | And then equally important, for attaching his name to it.\par 72 | So if you look at this book, and you will find what\par 73 | according to them was the first Venn diagram,\par 74 | and here it is, it's the original Venn diagram.\par 75 | It consists of the set that contains the names of people,\par 76 | and the other set that contains the names of diagrams,\par 77 | and guess what's in the intersection,\par 78 | guess what's in both of those 
sets?\par 79 | John Venn, so if you want to see how to implement\par 80 | Venn diagrams in Python, then you'll need to\par 81 | download matplotlib_venn package.\par 82 | And there's a link in the notebook,\par 83 | and once you do that you can write\par 84 | import matplotlib.pyplot as plt\par 85 | and import matplotlib_venn as venn\par 86 | and then you need to find two sets\par 87 | and it has to be one, two, three.\par 88 | For example, T to be this set, and then to plot\par 89 | you write venn.venn2, two stands for plotting and\par 90 | Venn diagram of two sets, and then you can label them.\par 91 | You can incorporate the sets S and T and you can label\par 92 | them S, S, and T, otherwise it would just be like A and B.\par 93 | Then you just plot dot show them.\par 94 | And what you'll get is this, you'll get the set S,\par 95 | because of the label S.\par 96 | And the set T, because of this.\par 97 | And then you'll get the relative sizes,\par 98 | so you'll see which one is larger and which is smaller.\par 99 | And you can see the intersection.\par 100 | And you can also do Venn diagrams of three by\par 101 | venn.venn3 and you'll write three sets, S, T, and U.\par 102 | Whatever the set U is going to be.\par 103 | You set the labels that you want,\par 104 | and then you'll get a Venn diagram of three sets.\par 105 | All right, so we talked about set visualizations,\par 106 | specifically used Venn diagrams, we decided\par 107 | that we can use them to visualize, to think, to prove,\par 108 | and to understand sets and that's going to be very\par 109 | useful in the future for us.\par 110 | And next time we're going to talk about set relations.\par 111 | See you then.\par 112 | End of transcript. Skip to the start.\par 113 | Discussion\par 114 | } 115 | -------------------------------------------------------------------------------- /Week 2 Sets/2.7 Russell's Paradox Russell's Paradox.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - So this is our fourth\par 4 | and last video about sets,\par 5 | so I thought I'll show you\par 6 | how even the simple topics we discussed\par 7 | can lead to interesting and surprising consequences.\par 8 | In this case an interesting paradox.\par 9 | The paradox is due to Bertrand Russell,\par 10 | who was a British mathematician,\par 11 | philosopher and author,\par 12 | and amongst other things,\par 13 | he received a Nobel prize for literature.\par 14 | He was also a well known wit\par 15 | who had something to say about almost anything.\par 16 | And he had a special place in his heart\par 17 | for human intelligence.\par 18 | So here are a few of his quotes.\par 19 | He said that in democracy, fools have the right to vote,\par 20 | while in dictatorships fools have the right to rule.\par 21 | That most people would rather die than think,\par 22 | and in fact, most do.\par 23 | And perhaps most relevant to our course,\par 24 | he said that men are born ignorant, not stupid,\par 25 | but it's education that makes them stupid.\par 26 | So with that, let's educate ourselves about his paradox.\par 27 | Let's start with a small review about sets in sets.\par 28 | Remember that sets can be elements.\par 29 | For example, here is a set, and another set,\par 30 | and both of them are elements of the set\par 31 | that contains 
both of them.\par 32 | Also, you remember that every set\par 33 | can be a subset of itself.\par 34 | For example, the empty set is contained in itself.\par 35 | The more interesting question is whether\par 36 | a set can belong to, or be an element of itself.\par 37 | Namely, can a set S be an element of the set S?\par 38 | Now, typically sets do not belong to themselves.\par 39 | For example, if you take the set\par 40 | that contains just 0,\par 41 | it has a single element, which is the number 0.\par 42 | So the set zero is not a member of,\par 43 | or an element of the set 0.\par 44 | Similarly, the empty set contains no elements,\par 45 | and therefore in particular the empty set\par 46 | is not an element of the empty set.\par 47 | On the other hand, some sets do contain themselves,\par 48 | and do belong to themselves.\par 49 | So consider for example NT, non Trump,\par 50 | the set of anything that is not Trump.\par 51 | This set contains several elements.\par 52 | For example, Hillary Clinton is not Trump, so she's in.\par 53 | The number zero is not Donald Trump, so it is in the set.\par 54 | The set \{1,2\} is not Trump, so it's also in NT.\par 55 | And, when you think about it,\par 56 | lots of element are in the set.\par 57 | In fact, everything except Donald Trump is in the set.\par 58 | So, in particular, the set NT itself,\par 59 | it is not Donald Trump.\par 60 | So NT itself is in the set.\par 61 | So we therefore get that the set NT\par 62 | does belong, or is an element, of itself.\par 63 | So that's surprising, and indeed\par 64 | can lead to interesting consequences.\par 65 | One of them is that if you have a set that contains itself,\par 66 | for example if you take this slide,\par 67 | and if it contains itself,\par 68 | then the smaller slide is the same as the original slide,\par 69 | and therefore it contains itself again.\par 70 | And the smaller slide contains itself once more, and so on.\par 71 | So you get an infinite recursion that way.\par 72 | But don't worry about that.\par 73 | The only thing we need to know\par 74 | is that some sets are elements of themselves.\par 75 | For example the set NT is an element of itself.\par 76 | While other sets, like the set containing just 0,\par 77 | is not an element of itself.\par 78 | That's the only thing we need to know.\par 79 | And you notice this is something we could have discussed\par 80 | even after the first lecture.\par 81 | You don't need anything except that\par 82 | you have a set that contains itself,\par 83 | and a set that does not contain itself.\par 84 | So with that, what is Russell's Paradox?\par 85 | It is that you can define a set that cannot exist.\par 86 | So Russell considered the following set R.\par 87 | It's the set of all sets\par 88 | that don't belong to themselves.\par 89 | So in other words, it's the collection of all S,\par 90 | such that S is not an element of itself.\par 91 | Since this is our main definition,\par 92 | it may be worth thinking about it.\par 93 | We define a set such that\par 94 | if a set is an element of itself,\par 95 | then it is not in R.\par 96 | And if the set is not an element of itself,\par 97 | then it is in R.\par 98 | So now, clearly the set that contains 0,\par 99 | as we saw before, is not an element of itself.\par 100 | We saw it in the previous slide.\par 101 | And therefore, by our definition, it is in R.\par 102 | Conversely, NT was an element of itself,\par 103 | and therefore by our definition here, it is not in R.\par 104 | So the question is, 
if you take R itself,\par 105 | is R in R, or is R not in R?\par 106 | So clearly, one of those two things must hold.\par 107 | Either R is in R, or R is not in R.\par 108 | And what we are going to show\par 109 | is that both of those will lead to a contradiction.\par 110 | So this cannot happen.\par 111 | So imagine first that R is in R,\par 112 | namely R is an element of itself.\par 113 | Then by our definition, R cannot be in R,\par 114 | because the set R just contains the elements\par 115 | that are not in itself.\par 116 | So if R is in R, then R is not in our set,\par 117 | in Russell's set.\par 118 | And this means that R is both in R, and not in R,\par 119 | which is a contradiction.\par 120 | Now, if R is not in R, then,\par 121 | by the definition of our set,\par 122 | R is in Russell's set, and therefore R is in R,\par 123 | so again we get that R is both not in R, and in R,\par 124 | which is again a contradiction.\par 125 | And that means that if R existed,\par 126 | then it would both be that R is in R,\par 127 | and R would not be in R.\par 128 | Both of those would hold, and that's impossible,\par 129 | and therefore that means that R has been defined,\par 130 | but it cannot exist.\par 131 | So that's the contradiction.\par 132 | Now let's see what happened.\par 133 | So we defined the sets recursively,\par 134 | and that was the source of our problem.\par 135 | Because when we looked at sets\par 136 | that contained sets that contained themselves,\par 137 | then we got an infinite recursion.\par 138 | And when we considered the set\par 139 | that included sets that do not contain themselves,\par 140 | then we got a contradiction.\par 141 | So either way, once we got to recursive definitions,\par 142 | we were not in Kansas anymore.\par 143 | But don't worry, we're still\par 144 | in the continental United States,\par 145 | so we keep things simple.\par 146 | And we'll only consider sets\par 147 | that are not recursively defined,\par 148 | so we'll avoid sets that are self-referential.\par 149 | And also this material is not needed for the exam,\par 150 | although I think it's something\par 151 | that you probably want to know.\par 152 | Now this paradox has several variations.\par 153 | For example, you have a barber\par 154 | who shaves exactly the people\par 155 | who don't shave themselves.\par 156 | And the question is, does the barber shave himself,\par 157 | or does he not?\par 158 | Because if he shaves himself,\par 159 | then he shaves someone who does shave himself.\par 160 | And if he doesn't shave himself,\par 161 | then he doesn't shave everyone\par 162 | who does not shave themselves.\par 163 | And a similar paradox is, "This sentence is a lie."\par 164 | Is this sentence true, or not true?\par 165 | Even Pinocchio cannot figure this out.\par 166 | Or, "This is not a pipe," by Dali.\par 167 | And just in case you think these paradoxes\par 168 | are just philosophical issues,\par 169 | maybe you want to consider this.\par 170 | - I would never want to belong to any club\par 171 | that would have someone like me for a member.\par 172 | That's the key joke of my adult life\par 173 | in terms of my relationships with women.\par 174 | - I hope you learned, and enjoyed\par 175 | the videos, and next time\par 176 | we'll start talking about counting.\par 177 | Thank you.\par 178 | End of transcript. 
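The matplotlib_venn calls described step by step in the Venn-diagrams lecture above can be collected into a short runnable sketch. The package must be installed first, as the lecture notes, and the contents of S, T, U and the labels are illustrative choices.

```python
# Sketch of the Venn-diagram plotting walked through in the Venn-diagrams lecture.
import matplotlib.pyplot as plt
import matplotlib_venn as venn

S = {1, 2, 3}
T = {3, 4, 5}          # example second set; the lecture leaves T up to the reader

venn.venn2([S, T], set_labels=('S', 'T'))   # two-set diagram with relative sizes
plt.show()

U = {1, 5, 6}
venn.venn3([S, T, U], set_labels=('S', 'T', 'U'))   # three-set version
plt.show()
```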
Skip to the start.\par 179 | POLL\par 180 | } 181 | -------------------------------------------------------------------------------- /Week 2 Sets/Problem Set 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 2 Sets/Problem Set 2.pdf -------------------------------------------------------------------------------- /Week 2 Sets/Programming Assignment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 2 Sets/Programming Assignment.pdf -------------------------------------------------------------------------------- /Week 2 Sets/Quiz 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 2 Sets/Quiz 2.pdf -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/1 Sets.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 Hello, and welcome back.\par 4 | So we're done talking about just sets on their own,\par 5 | and we're going to start talking about counting,\par 6 | and don't worry, we won't be counting sheep\par 7 | and fall asleep because we'll be counting sets,\par 8 | and in fact, our goal in these next few lectures\par 9 | is going to be to develop techniques\par 10 | so that actually we don't need to count almost at all.\par 11 | If we get it right, we will never need to count\par 12 | to more than just three.\par 13 | So, as we said, we're counting sets,\par 14 | so we need to describe what we're counting.\par 15 | The number of elements in a set S is called its size,\par 16 | or cardinality, and is denoted by these two vertical bars,\par 17 | the size of S, or pound S, again, the size of S.\par 18 | Let's do a few examples.\par 19 | The set of bits consists of zero and one,\par 20 | and it's cardinality is two.\par 21 | It has two elements.\par 22 | Coin has two sides, heads and tails.\par 23 | Its size is also two, and a die has six faces,\par 24 | one up to six, there are 10 digits,\par 25 | and there are 26 letters,\par 26 | and the empty set, noted like so,\par 27 | has no elements, so its cardinality, or its size, is zero,\par 28 | and on the other extreme, the set of integers Z,\par 29 | or the set of natural numbers N,\par 30 | or the set of positive integers,\par 31 | or the set of rational numbers\par 32 | all have infinitely many elements,\par 33 | and the reals also have infinitely many elements,\par 34 | or the size is also infinite.\par 35 | There's actually a difference between these two infinites,\par 36 | but we're not going to worry about that right now.\par 37 | These are called countably infinite,\par 38 | and these are called uncountably infinite,\par 39 | but let's not worry about them.\par 40 | Right?\par 41 | Let's look at two other sets and their sizes.\par 42 | Remember we discussed integer intervals?\par 43 | For m less than or 
equal to n,\par 44 | we define m dot dot n to be the set of integers\par 45 | from m to n, inclusive of both m and n,\par 46 | and for example, three dot dot five\par 47 | is the set three, four, five,\par 48 | and the size of m dot dot n is going to be\par 49 | n minus m plus one, and the question often asked\par 50 | is why the plus one, so first of all,\par 51 | we can try small numbers like five dot dot five.\par 52 | That's just the set five, so it's one,\par 53 | which is five minus five plus one,\par 54 | or one dot dot three.\par 55 | One, two, three.\par 56 | That size is three, and that's three minus one,\par 57 | three minus one, which will give us two plus one,\par 58 | which will give us three, and as you see,\par 59 | it's as easy as one, two, three,\par 60 | and so far we didn't need to count more than three,\par 61 | and to see why we're adding the one,\par 62 | let's look, for example, this interval\par 63 | three, four, five.\par 64 | It's the interval here.\par 65 | So you see we can have three points like that,\par 66 | but when we count five minus three,\par 67 | we're not counting the number of points or elements,\par 68 | we're just counting the distance\par 69 | between five and three, which is two,\par 70 | and what we're counting is\par 71 | the length of this segment from three to five,\par 72 | which is the number of unit intervals here,\par 73 | but what we're interested is in the number of points\par 74 | and there's one more point in the interval\par 75 | because there's a point to the left of every interval\par 76 | and then there's a point to the right\par 77 | of the right-most interval, so we need to add one.\par 78 | So five minus three plus one, which will give us three.\par 79 | Also, we talked about integer multiples.\par 80 | Let me just for clarity, we said\par 81 | that the set n is the set 1 up to n.\par 82 | That'll be denoted by open brace, open parentheses here\par 83 | and then a square bracket on the right\par 84 | to just indicate even more strongly\par 85 | that we're not using zero, so it's one up to n,\par 86 | just another notation.\par 87 | The set of integers between 1 and n\par 88 | that are multiples of d is denoted like that.\par 89 | It's a set of integers between one and n\par 90 | that are divisible by d or the multiples of d.\par 91 | For example, the set of integers that are multiples of three\par 92 | between one and eight is three and six,\par 93 | which is one times three and then two times three,\par 94 | and the set of integers between one and nine\par 95 | that are divisible by three, three, six, and nine,\par 96 | which is one times three, two times three,\par 97 | and three times three, and if you look\par 98 | at the size of the set, as you see,\par 99 | that it's n divided by d, and then we're taking\par 100 | the floor, the largest integer that is\par 101 | less than or equal to this ratio.\par 102 | For example, for integers between one and eight\par 103 | that are divisible by three, it's the floor\par 104 | of eight over three, which is two,\par 105 | and that's because we have one times three\par 106 | and two times three here, and the largest\par 107 | we can multiply is going to be, in this case,\par 108 | the floor of eight over three,\par 109 | and when we have the size of the integers\par 110 | that are multiples of three between one and nine,\par 111 | it's going to be the floor of nine over three,\par 112 | and that's three.\par 113 | That's because we have one times three,\par 114 | two times three, and 
three times three,\par 115 | and again, the number three comes from\par 116 | nine divided by three, and we take the floor of that.\par 117 | Now, in Python,\par 118 | set size is\par 119 | expressed using the len function.\par 120 | For example, we can write print len\par 121 | of minus one and one.\par 122 | It will give us two, and we can do other things,\par 123 | for example, we want to incorporate\par 124 | the sum of the elements in a set,\par 125 | we'll write sum, so print the sum of minus one, one\par 126 | will give us zero.\par 127 | To find the minimum of several elements,\par 128 | using the min function, print min of minus one and one,\par 129 | will give us minus one, and the maximum,\par 130 | using the max function, print the max of minus one and one,\par 131 | will give us one, and we can also do these calculations\par 132 | including sum, including length,\par 133 | by just iterating over elements or writing loops,\par 134 | so we say, for some variable in some set,\par 135 | do something, so, for example, here\par 136 | A is one, two, three, and if one, two,\par 137 | calculate the sum of A, and just write print sum of A,\par 138 | it will give us six, or we can say total is zero,\par 139 | and for i in A, total plus equals i,\par 140 | and print total, and that will give us, again, six.\par 141 | With that, we're going to continue next time\par 142 | by calculating the size of different sets\par 143 | like unions and intersections and so on.\par 144 | See you then.\par 145 | End of transcript. Skip to the start.\par 146 | I have completed this\par 147 | Previous\par 148 | } 149 | -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/11 Binomial Coefficient.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 POLL\par 4 | \par 5 | Assume you are choosing a class schedule and you need to take 2 science classes and 2 art classes. 
If there are 6 science classes and 5 art classes available to choose from, how many different class schedules can you come up with?\par 6 | \par 7 | \tab\par 8 | 25\par 9 | \par 10 | \tab\par 11 | 55\par 12 | \par 13 | \tab\par 14 | 60\par 15 | \par 16 | \tab\par 17 | 150\par 18 | } 19 | -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/4 Mix It Up.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 Hello, and welcome back.\par 4 | So, in the last video we talked about\par 5 | the size of general unions and now\par 6 | we would like to discuss Cartesian Products.\par 7 | So,\par 8 | if you have a set, for example, \{a,b\}\par 9 | and another set, \{1,2,3\},\par 10 | the set \{a,b\} is size two,\par 11 | the set \{1,2,3\} is size three,\par 12 | the size of the Cartesian Product \{a,b\}\par 13 | times \{1,2,3\}, is the size we know\par 14 | of this set here A one, A two, A three,\par 15 | B one, B two, B three, as we know this is\par 16 | the Cartesian Product of these two sets\par 17 | and the size of this set where we have\par 18 | three elements in the first row\par 19 | and\par 20 | so three elements in the first row\par 21 | and three elements in the second row,\par 22 | and these elements are disjoint.\par 23 | The elements here are different from\par 24 | the elements here, so therefore by the addition rule,\par 25 | the number of elements is three plus three\par 26 | which is the same as two times three which is six.\par 27 | Okay.\par 28 | And\par 29 | here is another interpretation of the same result.\par 30 | We have here all the elements,\par 31 | A one, A two, A three,\par 32 | B one, B two, B three,\par 33 | we have two rows and three columns,\par 34 | so the total number of elements\par 35 | that we have is two times three which is six sets.\par 36 | Essentially the same argument but maybe\par 37 | this gives more for an idea of volume\par 38 | oh, and area.\par 39 | Alright.\par 40 | So like this area is two times three which is six.\par 41 | Now,\par 42 | in general we can see that the size\par 43 | of a Cartesian Product is the product\par 44 | of the set sizes.\par 45 | So if you have two sets A and B,\par 46 | then the size of the Cartesian Product\par 47 | A times B is just going to be\par 48 | the size of A times the size of B.\par 49 | 'Kay, and this is called the product rule,\par 50 | which we'll denote by this X.\par 51 | And as you can see it's another\par 52 | application of the addition rule\par 53 | because we just add things,\par 54 | you know the size of A times.\par 55 | So here we added them twice,\par 56 | so it's just like a generalization\par 57 | if you want of the addition rule.\par 58 | Okay.\par 59 | Here's some applications of Cartesian Products.\par 60 | Let's start with\par 61 | the ubiquitous tables.\par 62 | So every table as we had said,\par 63 | that we use, is a Cartesian Product\par 64 | it has records which are\par 65 | viewed as rows.\par 66 | Here we have five records\par 67 | and it has attributes, here we have,\par 68 | which are viewed as columns,\par 69 | here we have three, and the number of cells that we have\par 70 | is the size of this Cartesian Product.\par 71 | Now we use\par 72 | multiplication principle is going,\par 73 | or the product 
rule, it's going to be\par 74 | five times three which is 15 cells.\par 75 | If you have more than two sets,\par 76 | so\par 77 | A times B, the Cartesian Product is set of\par 78 | all (a,b) such that a is in A\par 79 | and b is in B, and this gives\par 80 | you a rectangle,\par 81 | like that, and the size of the rectangle\par 82 | as we saw is just the product\par 83 | of the sizes of these two sets.\par 84 | If we have Cartesian Product of three sets,\par 85 | it's the collection of all (a,b,c) such that\par 86 | a is in A, b is in B, and c is in C,\par 87 | and that will, instead of a rectangle,\par 88 | is going to give us like a cuboid.\par 89 | Something that looks like that,\par 90 | and the number of elements there is\par 91 | going to be the size of this Cartesian Product\par 92 | is the number of elements.\par 93 | It's the size of A times the size of B times the size of C.\par 94 | 'Kay.\par 95 | So let's see an example.\par 96 | We have someone who likes\par 97 | to dress differently.\par 98 | And they have three shirts, let's say two pants,\par 99 | and five pairs of shoes.\par 100 | And one wonders how many days\par 101 | they can go with different outfits.\par 102 | So how many different outfits can they have?\par 103 | So notice that an outfit is just a triple.\par 104 | It's a shirt,\par 105 | a pant,\par 106 | a pair of pants,\par 107 | and a pair of shoes.\par 108 | So for example, and outfit may be\par 109 | the yellow shirt, the blue pants,\par 110 | and the red shoes, 'kay?\par 111 | So an outfit is just an order triple.\par 112 | And the set of outfits,\par 113 | so this is an outfit,\par 114 | and the set of outfits\par 115 | is the set of three tuples, or the triples,\par 116 | which is just a Cartesian Product of\par 117 | all shirts times all pants times all shoes.\par 118 | It's a Cartesian Product.\par 119 | And the size of the number of outfits,\par 120 | mainly the number of different outfits that there are,\par 121 | is by the product rule, is going to be\par 122 | the number of shirts times the number of pants\par 123 | times the number of shoes.\par 124 | And in this case, it's going to be three\par 125 | times two times five.\par 126 | Three\par 127 | times two times five,\par 128 | which is 30.\par 129 | 'Kay.\par 130 | And\par 131 | you know, you might think how this is again\par 132 | not very useful just counting the number of outfits.\par 133 | Maybe the following example, it will convince you\par 134 | that this is indispensable.\par 135 | So let's say you go to Costco\par 136 | and you want\par 137 | to\par 138 | buy tissue paper.\par 139 | 'Kay.\par 140 | Bathroom tissue.\par 141 | And so you get, you know, a package that\par 142 | looks a little bit like that\par 143 | and are trying to count how many\par 144 | rolls there are is hopeless.\par 145 | But what you can do is you can count and see\par 146 | that you know, there are maybe the sizes\par 147 | are three by three by four.\par 148 | And therefore, by the product rule\par 149 | the number of rolls that you have\par 150 | is three times three times four which is 36.\par 151 | Alright.\par 152 | So, if you have n sets,\par 153 | then again you can use the product rule\par 154 | and induction, you get the size of A one\par 155 | Cartesian Product A two up to A n\par 156 | is just the product of the sizes.\par 157 | And here is an application of that.\par 158 | Suppose you want to go for lunch.\par 159 | You want to go to Subway and you ask\par 160 | how many sandwiches can Subway make?\par 
161 | And let's say that they have two breads\par 162 | which wheat and Italian.\par 163 | And they have let's say, five meats, listed here.\par 164 | Three cheeses,\par 165 | four veggies,\par 166 | three sauces,\par 167 | and let's say that you need to create\par 168 | a sandwich, you need to choose one of each.\par 169 | You need to choose a bread and a meat,\par 170 | a cheese and so on.\par 171 | Then the set of all sandwiches that you can make\par 172 | is the Cartesian Product of all breads times\par 173 | all meats times the set of cheeses, and so on.\par 174 | Because every sandwich consists of just\par 175 | one element from each.\par 176 | Okay, and you can choose all those,\par 177 | all those breads and so on.\par 178 | So the number of sandwiches is going to be\par 179 | the product of the number of breads and so on.\par 180 | Okay.\par 181 | And that's going to be by the,\par 182 | this is by the product rule\par 183 | and it's going to be equal to two times five\par 184 | times three times four times three\par 185 | which is 360.\par 186 | Now in fact, they have many more choices\par 187 | and number of sandwiches they can make\par 188 | is actually quite astronomical.\par 189 | Alright.\par 190 | So to summarize, we talked about the product rule,\par 191 | that says that the size of A Cartesian Product with B\par 192 | is the size of A times the size of B.\par 193 | Denotated by this multiplication sign.\par 194 | We said that for multi sets, for multiple sets,\par 195 | the size of the Cartesian Product is again\par 196 | the product of their sizes,\par 197 | and what are we going to talk about next time?\par 198 | Cartesian Powers.\par 199 | See you then.\par 200 | End of transcript. Skip to the start.\par 201 | POLL\par 202 | \par 203 | Which is better for calculating the cardinality of a set: inclusion-exclusion rule or complement rule?\par 204 | \par 205 | \tab\par 206 | It depends\par 207 | \par 208 | \tab\par 209 | Inclusion-Exclusion Rule\par 210 | \par 211 | \tab\par 212 | Complement Rule\par 213 | } 214 | -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/6 Counting Variations.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 POLL\par 4 | \par 5 | A student is given a problem: What is the size of the set of 2-letter sequences that cannot use the letter Q? The student first calculates the total number of sequences using all letters to be 26*26=676. He then calculates the number of forbidden sequences. First, he calculates the number of sequences where a Q is the first letter (i.e. QS) to be 26. Next, he calculates the number of sequences where a Q is the second letter (i.e. BQ) to also be 26. He then adds 26+26 to get the total number of forbidden sequences and subtracts them from the total to get 624. 
What is wrong with his approach?\par 6 | \par 7 | \tab\par 8 | He subtracted the sequence QQ from the total twice.\par 9 | \par 10 | \tab\par 11 | The sequences with a Q as the second letter only total 24.\par 12 | \par 13 | \tab\par 14 | He incorrectly calculated the total number of sequences using any letters.\par 15 | \par 16 | \tab\par 17 | Nothing, his answer is correct.\par 18 | \par 19 | Submit\par 20 | Discussion\par 21 | } 22 | -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/9 Partial Permutations.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Welcome again everyone.\par 4 | Last lecture, we talked about permutations\par 5 | and now we want to talk about partial permutations.\par 6 | These are permutations where you don't want to\par 7 | arrange all the objects that you have,\par 8 | but just some subset.\par 9 | So we know that the number of ways\par 10 | to arrange n objects is n factorial\par 11 | and we would like to determine how many ways,\par 12 | or in how many orders,\par 13 | can you arrange some of the n objects.\par 14 | So for example,\par 15 | let's look at PINs that consist of two digits.\par 16 | So if you allow any digits,\par 17 | then, for example, 11 or 45,\par 18 | in other words, you allow things to repeat,\par 19 | then, the total number of PINs that you have\par 20 | is ten times ten.\par 21 | Ten for the first choice and ten for the second choice,\par 22 | so it's ten times ten which is 100.\par 23 | On the other hand,\par 24 | if you insist on the digits being distinct\par 25 | then, for example,\par 26 | you allow 05 and 32 but not 33, because three repeats,\par 27 | then you have ten options for the first digit\par 28 | and nine options for the second digit\par 29 | and the total number of ways you can do it\par 30 | is ten times nine, which is 90.\par 31 | Similarly, if you look at three letter words,\par 32 | then if you allow any letters,\par 33 | for example, mom or xyz,\par 34 | so for example here we see repetition,\par 35 | then you have 26 for each letter\par 36 | so 26 cubed is the total number of\par 37 | three letter words that you can come up with,\par 38 | but if you insist on distinct letters,\par 39 | and you allow things like abc but not dad,\par 40 | then the number of different sequences you can have\par 41 | is you have 26 options for the first letter,\par 42 | and then 25 for the second, 24 for the third,\par 43 | so you just get this product.\par 44 | And as you can see,\par 45 | it's just the calculation that we did for permutations,\par 46 | except that we're not going all the way down to one,\par 47 | but we're just finding just three out of this 26 elements.\par 48 | In other words,\par 49 | what we're counting here is we're saying we have 26 elements\par 50 | in this case, for example, the 26 letters\par 51 | and we don't want to arrange all of them,\par 52 | but we want to arrange three of them.\par 53 | The question is how many ways can we do it.\par 54 | And another way to view this thing\par 55 | in a completely identical way\par 56 | is to say that we are looking instead of sequences\par 57 | where we allow all possible sequences,\par 58 | this would give us the Cartesian product,\par 59 | now we're looking at sequences\par 60 
| where we insist that all the elements are distinct.\par 61 | Okay, so that's another view of the same problem.\par 62 | So how many partial permutations are there?\par 63 | We call such permutations where you have n objects\par 64 | and you want to arrange k of them,\par 65 | we call it a permutation of k out of n objects,\par 66 | or we call it a k-permutation of n.\par 67 | And we want to calculate\par 68 | how many such k-permutations of n there are.\par 69 | So as we just said,\par 70 | there are n ways to write the first element,\par 71 | and then n minus one to select the second element\par 72 | and so on until we get to n minus k plus one\par 73 | to select the kth element,\par 74 | right, 'cause it's n for the first one\par 75 | and n minus one for the second\par 76 | so it's going to be n minus k plus one for the kth element.\par 77 | And this number, we can easily see\par 78 | that we can continue it up to, if you want,\par 79 | times n minus k all the way down, times one\par 80 | and then divide by this ending\par 81 | so we get that this is\par 82 | n factorial divided by n minus k factorial.\par 83 | And we denote this by n to the k with an underline.\par 84 | That's what we call it today,\par 85 | kth falling power of n.\par 86 | And some people denote it by P(n,k),\par 87 | for permutation of n elements where we take just k of them.\par 88 | So here are some values of the kth falling power of n\par 89 | for small values of k.\par 90 | So for k equals one, n to the one falling power is just n.\par 91 | For k equals two it's n times n minus one.\par 92 | For k equal to three,\par 93 | it's n times n minus one times n minus two\par 94 | and so on up to k,\par 95 | it's n times n minus one times n minus k plus one\par 96 | as we define it to be.\par 97 | So, let's finish with an example of partial permutations.\par 98 | So imagine that you have five programming books,\par 99 | five probability books and six machine-learning books.\par 100 | And you wonder how many ways can you write a list\par 101 | that contains two books from each subject\par 102 | where books from the same subject are listed consecutively,\par 103 | namely next to each other.\par 104 | So we wonder how many such sequences there are.\par 105 | Examples of lists like that would be\par 106 | the third probability book, the first probability book,\par 107 | and then the fifth machine-learning book\par 108 | and the second machine-learning book,\par 109 | and programming book number one\par 110 | and programming book number four.\par 111 | We can see that the probability books are listed together.\par 112 | The machine-learning books are listed together\par 113 | and the programming books are listed together.\par 114 | Or machine-learning two and machine-learning six\par 115 | followed by programming one, programming two\par 116 | and so on again all the books from the same topic\par 117 | are listed next to each other.\par 118 | So we wonder how many such sequences there are.\par 119 | So we're going to use a combination\par 120 | of factorials and permutations\par 121 | and in some sense we're using the product rule many times.\par 122 | And they will show that the answer is three factorial\par 123 | times four to the second falling power\par 124 | times five to the second falling power\par 125 | times six to the second falling power.\par 126 | To see that, notice that to first decide\par 127 | we can first decide on the order of the topics.\par 128 | For example, here machine-learning, programming,\par 129 | 
and then probability.\par 130 | So there are three different topics\par 131 | and the number of ways to arrange them\par 132 | is therefore three factorial\par 133 | to decide on the order of the subjects\par 134 | like here's machine learning, programming, and probability.\par 135 | And then once we do that,\par 136 | we need to decide which machine-learning book\par 137 | we're going to choose first\par 138 | and which one we're going to choose second.\par 139 | And because there are six machine learning books\par 140 | I'm sorry,\par 141 | which programming book we're going to put first\par 142 | and which programming book we're going to put second.\par 143 | 'Cause there are four programming books,\par 144 | then we have four to the second falling power\par 145 | number of ways to do that\par 146 | because we can decide on the first programming book\par 147 | and then the second programming book\par 148 | and then we can decide on which probability books\par 149 | we're going to choose\par 150 | and since there are five of them\par 151 | we have five to the second falling power\par 152 | or five times 4 ways of doing that\par 153 | and then we need to decide which machine-learning books\par 154 | we are listing and in what order\par 155 | and we have six ways to choose the first one\par 156 | and five to do the second one\par 157 | or in other words we have\par 158 | six to the second falling power to do that.\par 159 | And we take the product of all these,\par 160 | by the product rules\par 161 | because we are making a decision for the order\par 162 | and making a separate decision\par 163 | for which programming books we choose,\par 164 | which probability books we choose, and so on,\par 165 | so we take the product,\par 166 | we get the total number of ways of writing such lists.\par 167 | So we have talked about permutations\par 168 | and we've talked about partial permutations\par 169 | and next time we'll talk about combinations.\par 170 | See you then.\par 171 | End of transcript. 
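The falling-power counts in this lecture are easy to check numerically with math.perm from the standard library (Python 3.8 or later); the last line multiplies out the product quoted at the end of the book-list example.

```python
# Numeric check of the partial-permutation (falling power) counts above.
# math.perm(n, k) = n! / (n - k)!  (available in Python 3.8+)
from math import factorial, perm

print(perm(10, 2))   # 90: two-digit PINs with distinct digits (10 * 9)
print(perm(26, 3))   # 15600: three-letter words with distinct letters (26 * 25 * 24)

# The book-list example, using the product quoted at the end of the lecture:
# 3! orderings of the topics, times 4, 5 and 6 books taken 2 at a time in order.
print(factorial(3) * perm(4, 2) * perm(5, 2) * perm(6, 2))   # 43200
```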
Skip to the start.\par 172 | POLL\par 173 | \par 174 | How many partial permutations are there for a 4-digit number where no two consecutive numbers are the same?\par 175 | \par 176 | \tab\par 177 | 5040\par 178 | \par 179 | \tab\par 180 | 6561\par 181 | \par 182 | \tab\par 183 | 7290\par 184 | \par 185 | \tab\par 186 | 10000\par 187 | } 188 | -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/Week_3_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 3 Counting and Combinatorics/Week_3_Part_1.pdf -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/Week_3_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 3 Counting and Combinatorics/Week_3_Part_2.pdf -------------------------------------------------------------------------------- /Week 3 Counting and Combinatorics/week3.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 3 Counting and Combinatorics/week3.zip -------------------------------------------------------------------------------- /Week 4 Probability and Conditioning/1 Distribution Types.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 Hello, and welcome back.\par 4 | So now that we have discussed randomness,\par 5 | and distributions, let's talk about\par 6 | distribution types.\par 7 | We're going to discuss uniform sample spaces;\par 8 | give some examples of coins, dice, and cards;\par 9 | and then we'll talk about non-uniform spaces,\par 10 | and give an example for tetrahedral die.\par 11 | We'll start with uniform probability spaces.\par 12 | Generally, outcomes may have different probabilities.\par 13 | For example, rain may have 10% probability\par 14 | of occurring on a given day,\par 15 | and 90% probability that it doesn't happen.\par 16 | But what we want to talk about now\par 17 | are uniform, also called equiprobable, spaces\par 18 | where the distribution is uniform.\par 19 | Everything has the same probability.\par 20 | All outcomes are equally likely.\par 21 | For example, if you flip a coin,\par 22 | then if the coin is far,\par 23 | then the probability of heads\par 24 | and the probability of tails are both one half.\par 25 | When you have a uniform space,\par 26 | then all probability specifications\par 27 | and calculations become a lot simpler, as we'll see.\par 28 | So as we said, in uniform probability spaces,\par 29 | all outcomes are equally likely.\par 30 | In other words, for all X in the sample space,\par 31 | the probability of any X is the same P.\par 32 | Now we know that one is going to be\par 33 | the probability of the whole sample space,\par 34 | and as we know, for probability distribution,\par 35 | when you sum the 
probabilities, it has to be one.\par 36 | Okay, and therefore we get the summation\par 37 | of all X in the sample space of P is one.\par 38 | This is just P times the size of the sample space,\par 39 | and that tells us that P, the probability\par 40 | of every element, is one over the size of the sample space,\par 41 | like one half for each side of a coin.\par 42 | For example, if we take a coin,\par 43 | probability of heads and probability of tails.\par 44 | If we assume each is P, then we have that\par 45 | one is P of heads plus P of tails, or 2P,\par 46 | and therefore P must be one half.\par 47 | In uniform spaces in general,\par 48 | we see that every outcome has probability\par 49 | which is one over the size of the sample space,\par 50 | like here one over two.\par 51 | When we have uniform spaces, we'll denote them\par 52 | by this U for uniform.\par 53 | All you need, for a uniform distribution,\par 54 | to know all the probabilities\par 55 | is just the size of the sample space.\par 56 | That makes them so simple for us.\par 57 | And one notation is that when\par 58 | we have a uniform sample space,\par 59 | we'll say that we draw uniformly from the space\par 60 | or we just draw randomly.\par 61 | So randomly, when we don't say anything else,\par 62 | will signify that we're drawing uniformly.\par 63 | So sometimes we just say randomly, and mean uniformly.\par 64 | Let's see a couple of examples.\par 65 | For a coin, the sample space consists\par 66 | of heads and tails, which we'll denote by H and T.\par 67 | The size of the sample space is two.\par 68 | We'll say that we flip a coin,\par 69 | or toss a coin, and the results are equally likely.\par 70 | In other words, the space is uniform.\par 71 | That means the probability of heads\par 72 | is equal to the probability of tails.\par 73 | As we saw in the previous slide,\par 74 | that means that each of them has a probability\par 75 | which is one over the size of the sample space,\par 76 | or one over two.\par 77 | If we take a fair die,\par 78 | the possible outcomes are one up to six.\par 79 | The size of the sample space is six.\par 80 | Therefore, if you roll the die,\par 81 | if we assume the die is fair,\par 82 | the faces are equally likely.\par 83 | In other words, the space is uniform.\par 84 | It's denoted by U.\par 85 | So the probability of one\par 86 | is equal to the probability of two, and so on\par 87 | up to the probability of six.\par 88 | That means that the probability of each element\par 89 | is one over the size of the sample space, or one over six.\par 90 | If we have a deck of cards,\par 91 | then we let the sample space be the set of all cards,\par 92 | in this case 52 in the standard deck.\par 93 | If we draw a card, everything is equally likely.\par 94 | Again, the space is uniform, a U.\par 95 | The probability of getting maybe a three of clubs,\par 96 | or a queen of hearts, all of them equally likely,\par 97 | which is one over the size of the sample space,\par 98 | which is one over 52.\par 99 | Now, in many cases, the space is non-uniform.\par 100 | Uniform spaces are coins and dice and so on,\par 101 | and in those cases everything is pretty simple.\par 102 | But in nature, non-uniform spaces abound.\par 103 | We gave the example of rain;\par 104 | also the grades that we get:\par 105 | an A plus is not as likely as other grades.\par 106 | Or our words.\par 107 | Some words are more likely than others.\par 108 | Some illnesses are more likely than others.\par 109 | Some web pages are visited more often.\par 110 | People that 
we see are not randomly distributed\par 111 | among all seven billion people on the planet,\par 112 | and so on.\par 113 | So we need to look at them.\par 114 | Now, the many examples of non-uniform spaces,\par 115 | and one typical example is a pie chart\par 116 | that we see here.\par 117 | These reflect the different probabilities\par 118 | of possible outcomes, as we see here.\par 119 | When we see a pie chart,\par 120 | what you see is, actually, could be uniform,\par 121 | but typically a non-uniform space.\par 122 | Typically the challenge with non-uniform spaces\par 123 | is how can we remember it if we're going to give examples.\par 124 | So we're going to try to give a simple example\par 125 | of a tetrahedral die.\par 126 | This is a four-sided, or pyramid die as it's also called.\par 127 | It's used in different games, board games.\par 128 | For example, Dungeons and Dragons.\par 129 | Typically, in those cases, the die's equiprobable.\par 130 | Each of the four faces has the same probability.\par 131 | But we're just going to give it some different values.\par 132 | We'll assume different probabilities.\par 133 | We'll try to make it easy to remember.\par 134 | The faces are one, two, three, four,\par 135 | and we'll just give them probabilities\par 136 | of .1, .2, .3, and .4.\par 137 | Notice that, conveniently, they add to one,\par 138 | and therefore this is a probability distribution.\par 139 | Now that we have these examples in mind,\par 140 | let's go over a few things about distributions\par 141 | and see what we can say about them or can't.\par 142 | Notice that random notation may be\par 143 | a little confusing at first.\par 144 | Which expressions are valid, and which are not?\par 145 | For example, we can say the probability\par 146 | that X is equal to three.\par 147 | That's a valid expression.\par 148 | That's the probability that the random outcome\par 149 | is going to turn up three.\par 150 | For example, if you have a fair die,\par 151 | this probability is 1/6th.\par 152 | So that's an okay expression.\par 153 | Can also write P of three.\par 154 | That's an abbreviation for the probability that X is three.\par 155 | This is just the same as writing\par 156 | the probability of X is three.\par 157 | You can also write P of X,\par 158 | but then we need to specify X.\par 159 | For example, we can say that,\par 160 | for X, P of X is a quarter.\par 161 | If you have a fair tetrahedral die, for example.\par 162 | Now things that are maybe not so clear\par 163 | what they can do is if you write probability\par 164 | of one equal to three, or probability\par 165 | of the random variable X.\par 166 | Those could happen, but not so likely.\par 167 | For example, here what you might mean\par 168 | is that you have a random variable,\par 169 | which is always one, and another random variable,\par 170 | and you ask what is the probability\par 171 | that it's equal to three.\par 172 | Then that probability is going to be zero.\par 173 | Here, maybe you mean you have random variable X,\par 174 | and you ask what is the probability\par 175 | of the random outcome that you observe.\par 176 | These are possible, but less common,\par 177 | so make sure that this is what you mean.\par 178 | This typically would mean zero,\par 179 | and this is a random value, as we said.\par 180 | Things that you should not do is,\par 181 | for example, write the probability\par 182 | that little X is equal to three,\par 183 | because little X is a value.\par 184 | Cannot write the 
probability it's equal to three.\par 185 | This is even less likely, and probably it's wrong.\par 186 | So, to summarize, we talked about\par 187 | different distribution types.\par 188 | We talked about uniform sample spaces;\par 189 | coins, dies, and cards as examples.\par 190 | We talked about non-uniform spaces.\par 191 | We gave the example for tetrahedral die.\par 192 | What are we going to do next time?\par 193 | We'll discuss events.\par 194 | See you then.\par 195 | End of transcript. Skip to the start.\par 196 | Discussion\par 197 | Topic: Week 4 / Distributions\par 198 | Show Discussion\par 199 | POLL\par 200 | \par 201 | An outcome in a uniform probability space has probability 1/10, what is the size of the sample space?\par 202 | \par 203 | \tab\par 204 | 5\par 205 | \par 206 | \tab\par 207 | 10\par 208 | \par 209 | \tab\par 210 | 20\par 211 | } 212 | -------------------------------------------------------------------------------- /Week 4 Probability and Conditioning/2 Distribution Types.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 Hello, and welcome back.\par 4 | So now that we have discussed randomness,\par 5 | and distributions, let's talk about\par 6 | distribution types.\par 7 | We're going to discuss uniform sample spaces;\par 8 | give some examples of coins, dice, and cards;\par 9 | and then we'll talk about non-uniform spaces,\par 10 | and give an example for tetrahedral die.\par 11 | We'll start with uniform probability spaces.\par 12 | Generally, outcomes may have different probabilities.\par 13 | For example, rain may have 10% probability\par 14 | of occurring on a given day,\par 15 | and 90% probability that it doesn't happen.\par 16 | But what we want to talk about now\par 17 | are uniform, also called equiprobable, spaces\par 18 | where the distribution is uniform.\par 19 | Everything has the same probability.\par 20 | All outcomes are equally likely.\par 21 | For example, if you flip a coin,\par 22 | then if the coin is far,\par 23 | then the probability of heads\par 24 | and the probability of tails are both one half.\par 25 | When you have a uniform space,\par 26 | then all probability specifications\par 27 | and calculations become a lot simpler, as we'll see.\par 28 | So as we said, in uniform probability spaces,\par 29 | all outcomes are equally likely.\par 30 | In other words, for all X in the sample space,\par 31 | the probability of any X is the same P.\par 32 | Now we know that one is going to be\par 33 | the probability of the whole sample space,\par 34 | and as we know, for probability distribution,\par 35 | when you sum the probabilities, it has to be one.\par 36 | Okay, and therefore we get the summation\par 37 | of all X in the sample space of P is one.\par 38 | This is just P times the size of the sample space,\par 39 | and that tells us that P, the probability\par 40 | of every element, is one over the size of the sample space,\par 41 | or one of the sides of one coin.\par 42 | For example, if we take a coin,\par 43 | probability of heads and probability of tails.\par 44 | If we assume it's P, then we have that\par 45 | one is P of heads plus P of tails, or 2P,\par 46 | and therefore P must be one half.\par 47 | In uniform spaces in general,\par 48 | we see that every outcome has probability\par 49 | which is one 
over the size of the sample space,\par 50 | like here one over two.\par 51 | We call uniform spaces, we'll denote them\par 52 | by this U for uniform.\par 53 | All you need to know for uniform distribution\par 54 | to know all the probabilities\par 55 | is just to know the size of the sample space.\par 56 | That makes them so simple for us.\par 57 | And one notation is that when\par 58 | we have uniform sample space,\par 59 | we'll say that we draw uniformly from the space\par 60 | or we just draw randomly.\par 61 | So randomly, when we don't say anything else,\par 62 | we'll signify that we're drawing uniformly.\par 63 | So sometimes just say randomly, and mean uniformly.\par 64 | Let's see a couple of examples.\par 65 | For a coin, the sample space consists\par 66 | of heads and tails, which we'll denote by H and T.\par 67 | The size of the sample space is two.\par 68 | We'll say that we'll flip a coin,\par 69 | or toss a coin, and the results are equally likely.\par 70 | In other words, the space is uniform.\par 71 | That means the probability of heads\par 72 | is equal to the probability of tails.\par 73 | As we saw in the previous slide,\par 74 | that means that each of them has a probability\par 75 | which is one over the size of the sample space,\par 76 | or one over two.\par 77 | If we take a fair die,\par 78 | the possible outcomes are one up to six.\par 79 | The size of the sample space is six.\par 80 | Therefore, if you roll the die,\par 81 | if we assume the die's fair,\par 82 | the faces are equally likely.\par 83 | In other words, the space is uniform.\par 84 | It's noted by U.\par 85 | So the probability of one\par 86 | is equal to the probability of two, and so on\par 87 | up to the probability of six.\par 88 | That means that the probability of each element\par 89 | is one over the sample size, or one over six.\par 90 | If we have a deck of cards,\par 91 | then we let (mumbles) the set of four cards.\par 92 | In this case, maybe 52 in the standard deck.\par 93 | If we draw a card, everything is equally likely.\par 94 | You know, a U.\par 95 | The probability of getting maybe a three of clubs,\par 96 | or a queen of hearts, all of them equally likely,\par 97 | which is one over the size of the sample space,\par 98 | which is one over 52.\par 99 | Now, in many cases, the space is non-uniform.\par 100 | Uniform spaces are coins and dies and so on,\par 101 | but in those cases everything is pretty good.\par 102 | But in nature, non-uniform spaces abound.\par 103 | We give the example of rain,\par 104 | also grades that we get\par 105 | are not equally likely to be an A plus as other grades.\par 106 | Our words.\par 107 | Some words are more likely than others.\par 108 | Some illnesses are more likely than others.\par 109 | Some web pages are visited more often.\par 110 | People that we see are not randomly distributed\par 111 | among all seven billion people on the planet,\par 112 | and so on.\par 113 | So we need to look at them.\par 114 | Now, the many examples of non-uniform spaces,\par 115 | and one typical example is a pie chart\par 116 | that we see here.\par 117 | These reflect the different probabilities\par 118 | of possible outcomes, as we see here.\par 119 | When we see a pie chart,\par 120 | what you see is, actually, could be uniform,\par 121 | but typically a non-uniform space.\par 122 | Typically the challenge with non-uniform spaces\par 123 | is how can we remember it if we're going to give examples.\par 124 | So we're going to try to give a simple example\par 
125 | of a tetrahedral die.\par 126 | This is a four-sided, or pyramid die as it's also called.\par 127 | It's used in different games, board games.\par 128 | For example, Dungeons and Dragons.\par 129 | Typically, in those cases, the die's equiprobable.\par 130 | Each of the four faces has the same probability.\par 131 | But we're just going to give it some different values.\par 132 | We'll assume different probabilities.\par 133 | We'll try to make it easy to remember.\par 134 | The faces are one, two, three, four,\par 135 | and we'll just give them probabilities\par 136 | of .1, .2, .3, and .4.\par 137 | Notice that, conveniently, they add to one,\par 138 | and therefore this is a probability distribution.\par 139 | Now that we have these examples in mind,\par 140 | let's go over a few things about distributions\par 141 | and see what we can say about them or can't.\par 142 | Notice that random notation may be\par 143 | a little confusing at first.\par 144 | Which expressions are valid, and which are not?\par 145 | For example, we can say the probability\par 146 | that X is equal to three.\par 147 | That's a valid expression.\par 148 | That's the probability that the random outcome\par 149 | is going to turn up three.\par 150 | For example, if you have a fair die,\par 151 | this probability is 1/6th.\par 152 | So that's an okay expression.\par 153 | Can also write P of three.\par 154 | That's an abbreviation for the probability that X is three.\par 155 | This is just the same as writing\par 156 | the probability of X is three.\par 157 | You can also write P of X,\par 158 | but then we need to specify X.\par 159 | For example, we can say that,\par 160 | for X, P of X is a quarter.\par 161 | If you have a fair tetrahedral die, for example.\par 162 | Now things that are maybe not so clear\par 163 | what they can do is if you write probability\par 164 | of one equal to three, or probability\par 165 | of the random variable X.\par 166 | Those could happen, but not so likely.\par 167 | For example, here what you might mean\par 168 | is that you have a random variable,\par 169 | which is always one, and another random variable,\par 170 | and you ask what is the probability\par 171 | that it's equal to three.\par 172 | Then that probability is going to be zero.\par 173 | Here, maybe you mean you have random variable X,\par 174 | and you ask what is the probability\par 175 | of the random outcome that you observe.\par 176 | These are possible, but less common,\par 177 | so make sure that this is what you mean.\par 178 | This typically would mean zero,\par 179 | and this is a random value, as we said.\par 180 | Things that you should not do is,\par 181 | for example, write the probability\par 182 | that little X is equal to three,\par 183 | because little X is a value.\par 184 | Cannot write the probability it's equal to three.\par 185 | This is even less likely, and probably it's wrong.\par 186 | So, to summarize, we talked about\par 187 | different distribution types.\par 188 | We talked about uniform sample spaces;\par 189 | coins, dies, and cards as examples.\par 190 | We talked about non-uniform spaces.\par 191 | We gave the example for tetrahedral die.\par 192 | What are we going to do next time?\par 193 | We'll discuss events.\par 194 | See you then.\par 195 | End of transcript. 
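To make the uniform versus non-uniform contrast concrete, here is a short simulation sketch (an added example, not part of the course notebooks), using the tetrahedral-die probabilities .1, .2, .3, .4 from the lecture; the sample size is arbitrary, and `random.choices` requires Python 3.6 or later.

```python
# Empirical frequencies for a fair (uniform) tetrahedral die versus the
# lecture's non-uniform die with face probabilities .1, .2, .3, .4.
import random
from collections import Counter

faces = [1, 2, 3, 4]
probs = [0.1, 0.2, 0.3, 0.4]          # non-uniform distribution; sums to one

n = 100000
uniform_rolls  = random.choices(faces, k=n)                  # uniform: each face ~ 1/4
weighted_rolls = random.choices(faces, weights=probs, k=n)   # non-uniform die

for name, rolls in [("uniform", uniform_rolls), ("non-uniform", weighted_rolls)]:
    freqs = {face: round(count / n, 3) for face, count in sorted(Counter(rolls).items())}
    print(name, freqs)
```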
Skip to the start.\par 196 | Discussion\par 197 | } 198 | -------------------------------------------------------------------------------- /Week 4 Probability and Conditioning/4_Permutations_and_Combinations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 4 Permutations and Combinations\n", 8 | "\n", 9 | "## 4.1 Permutations\n", 10 | "\n", 11 | "We find the number of $k$-permutations of $A$, first by determining the set of permutations and then by calculating $\\frac{|A|!}{(|A|-k)!}$. We first consider the special case of $k=|A|$, which is equivalent to finding the number of ways of ordering the elements of $A$. First we import the **itertools** library." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 33, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import itertools" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 34, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "A = {1, 2, 3}" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 35, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Permutations of set([1, 2, 3]): \n", 46 | "(1, 3, 2)\n", 47 | "(3, 2, 1)\n", 48 | "(2, 1, 3)\n", 49 | "(3, 1, 2)\n", 50 | "(1, 2, 3)\n", 51 | "(2, 3, 1)\n", 52 | "\n", 53 | "Number of permutations: 6\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "# Find all permutations of A and |A!|\n", 59 | "permute_all = set(itertools.permutations(A))\n", 60 | "print(\"Permutations of %s: \" %A)\n", 61 | "for i in permute_all:\n", 62 | " print(i)\n", 63 | "print;print \"Number of permutations: \", len(permute_all)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 36, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "6\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "# Find |A|! directly\n", 81 | "from math import factorial\n", 82 | "print(factorial(len(A)))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 37, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "A = {1, 2, 3, 4}\n", 94 | "k = 3" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 38, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "3-permutations of set([1, 2, 3, 4]): \n", 107 | "(1, 2, 3)\n", 108 | "(1, 2, 4)\n", 109 | "(1, 3, 2)\n", 110 | "(1, 3, 4)\n", 111 | "(1, 4, 2)\n", 112 | "(1, 4, 3)\n", 113 | "(2, 1, 3)\n", 114 | "(2, 1, 4)\n", 115 | "(2, 3, 1)\n", 116 | "(2, 3, 4)\n", 117 | "(2, 4, 1)\n", 118 | "(2, 4, 3)\n", 119 | "(3, 1, 2)\n", 120 | "(3, 1, 4)\n", 121 | "(3, 2, 1)\n", 122 | "(3, 2, 4)\n", 123 | "(3, 4, 1)\n", 124 | "(3, 4, 2)\n", 125 | "(4, 1, 2)\n", 126 | "(4, 1, 3)\n", 127 | "(4, 2, 1)\n", 128 | "(4, 2, 3)\n", 129 | "(4, 3, 1)\n", 130 | "(4, 3, 2)\n", 131 | "\n", 132 | "Size = 4!/(4-3)! = 24\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# Print all the k-permutations of A\n", 138 | "n = len(A)\n", 139 | "permute_k = list(itertools.permutations(A, k))\n", 140 | "print(\"%i-permutations of %s: \" %(k,A))\n", 141 | "for i in permute_k:\n", 142 | " print(i)\n", 143 | "print;print \"Size = \", \"%i!/(%i-%i)! 
= \" %(n,n,k), len(permute_k)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 39, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "24\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "# Print |A|!/(|A|-k)! directly\n", 161 | "print(int(factorial(len(A))/factorial(len(A)-k)))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## 4.2 Combinations\n", 169 | "We find the number of $k$-combinations of $A$, first by determining the set of combinations and then by simply calculating ${|A|}\\choose{k}$." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 40, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "from scipy.special import binom # to calculate the binomial coefficients |A| choose k" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 41, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "A = {1, 2, 3, 4}\n", 192 | "k = 2" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 42, 198 | "metadata": { 199 | "scrolled": true 200 | }, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "2-combinations of set([1, 2, 3, 4]): \n", 207 | "(1, 2)\n", 208 | "(1, 3)\n", 209 | "(1, 4)\n", 210 | "(2, 3)\n", 211 | "(2, 4)\n", 212 | "(3, 4)\n", 213 | "\n", 214 | "Number of combinations = 4!/(2!(4-2)!) = 6\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "# Print all the k-combinations of A\n", 220 | "choose_k = list(itertools.combinations(A,k))\n", 221 | "print(\"%i-combinations of %s: \" %(k,A))\n", 222 | "for i in choose_k:\n", 223 | " print(i)\n", 224 | "print;print(\"Number of combinations = %i!/(%i!(%i-%i)!) = %i\" %(n,k,n,k,len(choose_k) ))" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 43, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "6\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "# Print |A|!/(k!(|A|-k)!) directly\n", 242 | "print(int(factorial(len(A))/(factorial(k)*factorial(len(A)-k))))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "If you want to concatenate characters such as letters of the English alphabet and print them as strings, you can use the join() function." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 44, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "A = {'a', 'b', 'c', 'q'}\n", 261 | "k = 3" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 45, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "3-permutations of set(['a', 'q', 'c', 'b']):\n", 274 | "aqc\n", 275 | "aqb\n", 276 | "acq\n", 277 | "acb\n", 278 | "abq\n", 279 | "abc\n", 280 | "qac\n", 281 | "qab\n", 282 | "qca\n", 283 | "qcb\n", 284 | "qba\n", 285 | "qbc\n", 286 | "caq\n", 287 | "cab\n", 288 | "cqa\n", 289 | "cqb\n", 290 | "cba\n", 291 | "cbq\n", 292 | "baq\n", 293 | "bac\n", 294 | "bqa\n", 295 | "bqc\n", 296 | "bca\n", 297 | "bcq\n", 298 | "\n", 299 | "Size = 4!/(4-3)! 
= 24\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "# Print all the k-permutations of S\n", 305 | "n = len(A)\n", 306 | "permute_k = list(itertools.permutations(A, k))\n", 307 | "print(\"%i-permutations of %s:\" %(k,A))\n", 308 | "for i in range(0, len(permute_k)):\n", 309 | " print(''.join(permute_k[i]) )\n", 310 | "print;print \"Size = %i!/(%i-%i)! = \" %(n,n,k), len(permute_k)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 46, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "24\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "# Print |A|!/(|A|-k)! directly\n", 328 | "print(int(factorial(len(A))/factorial(len(A)-k)))" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 47, 334 | "metadata": { 335 | "collapsed": true 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "A = {'a', 'b', 'c', 'd'}\n", 340 | "k = 2" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 48, 346 | "metadata": { 347 | "scrolled": true 348 | }, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "2-combinations of set(['a', 'c', 'b', 'd']):\n", 355 | "\n", 356 | "ac\n", 357 | "ab\n", 358 | "ad\n", 359 | "cb\n", 360 | "cd\n", 361 | "bd\n", 362 | "\n", 363 | "Size = 4!/(2!(4-2)!) = 6\n" 364 | ] 365 | } 366 | ], 367 | "source": [ 368 | "# Print all the k-combinations of A\n", 369 | "choose_k = list(itertools.combinations(A,k))\n", 370 | "print(\"%i-combinations of %s:\\n\" %(k,A))\n", 371 | "for i in range(0, len(choose_k)):\n", 372 | " print(''.join(choose_k[i]) )\n", 373 | "print;print \"Size = %i!/(%i!(%i-%i)!) = \" %(n,k,n,k), len(choose_k)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 49, 379 | "metadata": {}, 380 | "outputs": [ 381 | { 382 | "name": "stdout", 383 | "output_type": "stream", 384 | "text": [ 385 | "6\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "# Print |A|!/(k!(|A|-k)!) 
directly\n", 391 | "print(int(factorial(len(A))/(factorial(k)*factorial(len(A)-k))))" 392 | ] 393 | } 394 | ], 395 | "metadata": { 396 | "kernelspec": { 397 | "display_name": "Python 2", 398 | "language": "python", 399 | "name": "python2" 400 | }, 401 | "language_info": { 402 | "codemirror_mode": { 403 | "name": "ipython", 404 | "version": 2 405 | }, 406 | "file_extension": ".py", 407 | "mimetype": "text/x-python", 408 | "name": "python", 409 | "nbconvert_exporter": "python", 410 | "pygments_lexer": "ipython2", 411 | "version": "2.7.13" 412 | }, 413 | "toc": { 414 | "colors": { 415 | "hover_highlight": "#DAA520", 416 | "navigate_num": "#000000", 417 | "navigate_text": "#333333", 418 | "running_highlight": "#FF0000", 419 | "selected_highlight": "#FFD700", 420 | "sidebar_border": "#EEEEEE", 421 | "wrapper_background": "#FFFFFF" 422 | }, 423 | "moveMenuLeft": true, 424 | "nav_menu": { 425 | "height": "48px", 426 | "width": "252px" 427 | }, 428 | "navigate_menu": true, 429 | "number_sections": true, 430 | "sideBar": true, 431 | "skip_h1_title": false, 432 | "threshold": 4, 433 | "toc_cell": false, 434 | "toc_position": {}, 435 | "toc_section_display": "block", 436 | "toc_window_display": false, 437 | "widenNotebook": false 438 | } 439 | }, 440 | "nbformat": 4, 441 | "nbformat_minor": 2 442 | } 443 | -------------------------------------------------------------------------------- /Week 4 Probability and Conditioning/Week_4_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 4 Probability and Conditioning/Week_4_Part_1.pdf -------------------------------------------------------------------------------- /Week 4 Probability and Conditioning/Week_4_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 4 Probability and Conditioning/Week_4_Part_2.pdf -------------------------------------------------------------------------------- /Week 5 Random Variables, Expectation, and Variance/4 Variable Modification.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hello and welcome back.\par 4 | So we're talking about expectations of random variables\par 5 | and what we're going to discuss first\par 6 | is what happens when you modify random variables\par 7 | and then we'll talk about expectations\par 8 | of these modifications.\par 9 | We want to discuss modifications\par 10 | of functions of random varables.\par 11 | Sometimes we're interested in\par 12 | not just the random variable itself,\par 13 | but some function of the random variable.\par 14 | For example if a person gets a salary,\par 15 | suppose it's some random number expressing dollars,\par 16 | which we call X then if they get a $10 raise\par 17 | then the salary now, instead of X, is X plus 10,\par 18 | which we might call y.\par 19 | Or if they get a 10% raise, then instead of salary X,\par 20 | the new salary is new random variable Y,\par 21 | which is 1.1 times X.\par 22 | Or if they become CEO then maybe\par 23 | the new salary Y is now 
X square.\par 24 | In all these cases we see that we had a random variable X\par 25 | and now we have a new random variable Y,\par 26 | which is some function of X.\par 27 | And so Y is some g of x for some function g,\par 28 | and you notice the g is a deterministic function.\par 29 | Defined over R, over the real numbers\par 30 | or whatever the domain of X is.\par 31 | Whatever the range of X is.\par 32 | But in all those cases again,\par 33 | you see that g is just a deterministic function.\par 34 | Like X plus 10, or 1.1 X, or X squared.\par 35 | It's a known function.\par 36 | And all the randomness in Y derives from X.\par 37 | So the only reason there's randomness\par 38 | about the new salaries is because there was some randomness\par 39 | about the original salary.\par 40 | As you can see, we're taking advantage of the fact\par 41 | that we're talking about random variables.\par 42 | We're talking about numbers,\par 43 | and we can now define functions over these variables.\par 44 | Okay so were going to see a few examples.\par 45 | So just to reiterate, X is deterministically modified by g.\par 46 | So g is deterministic.\par 47 | And X was random.\par 48 | And Y is g of X, so it's now random as well.\par 49 | So let's see a couple of examples.\par 50 | First is translation.\par 51 | So suppose we have a random variable X\par 52 | and we want to add the constant b to X.\par 53 | For example X two, add two, or something like that.\par 54 | So this is called translation.\par 55 | We are translating X, or we're moving X\par 56 | by this quantity b.\par 57 | So then Y is going to be X plus b.\par 58 | And here is an example.\par 59 | Here is X.\par 60 | It's distributed between one and four\par 61 | according to this distribution.\par 62 | And Y is X plus b.\par 63 | Maybe Y is X plus two.\par 64 | Then what happens is we just\par 65 | move the distribution of X by two.\par 66 | So, what was the probability of one before\par 67 | is now the probability of three.\par 68 | And probability of two becomes the probability of four.\par 69 | And so on.\par 70 | Because we just took X, whatever it was,\par 71 | and we added two to it.\par 72 | So just to say it a little more formally,\par 73 | what is the probability that the newer variable Y has,\par 74 | gets the value y.\par 75 | It's the probability because Y is X plus b.\par 76 | It's the probability that X plus b is equal to y.\par 77 | And that's the probability that X is equal to y minus b.\par 78 | So we can see here the probability that y is six,\par 79 | is the probability that X was two.\par 80 | The probability that y is five\par 81 | is the probability that X was three.\par 82 | And so on.\par 83 | The probability that y is three\par 84 | is the probability that X was one.\par 85 | Okay, so we see that we can relate the probabilities\par 86 | of X, the original X, to the probability of the new y.\par 87 | Using this function.\par 88 | Here's another example, scaling.\par 89 | So suppose we take X and we multiply it by a constant b.\par 90 | So we call it, we said that we scale X by a factor of b.\par 91 | Then Y is b times X.\par 92 | And here is our original X.\par 93 | And now let's multiply it by 1.5.\par 94 | Then the values of X originally\par 95 | were one, two, three, and four.\par 96 | Now they're going to be one times 1.5,\par 97 | which is 1.5.\par 98 | Or two times 1.5, which is three.\par 99 | On to the three we have 4.5.\par 100 | Instead of four we have six.\par 101 | And the probability that y is six\par 102 | 
is the probability that X was four, and so on.\par 103 | So I'll just again, writing it formally.\par 104 | The probability that Y is y,\par 105 | is the probability because Y is b times X,\par 106 | it's the probability that b times X is y.\par 107 | And that's the probability that X is y over b.\par 108 | Okay, and that's what we see here.\par 109 | The probability that y is six\par 110 | is the probability that X was six divided by 1.5.\par 111 | Or the probability that X was four.\par 112 | Now in these two examples,\par 113 | the function that we use b times X\par 114 | or X plus b was one to one.\par 115 | It mapped in different values of X,\par 116 | the different values of Y.\par 117 | But sometimes the function is not one to one,\par 118 | and things get a little more interesting.\par 119 | So let's look at the square function.\par 120 | And let's start in the range where the square is one to one.\par 121 | So let's say that we have a random variable X\par 122 | which is distribute of a zero one two.\par 123 | According to this probability,\par 124 | it's zero probability half.\par 125 | One with a probability of third.\par 126 | And two with probability one sixth.\par 127 | So you see that over the range zero one two,\par 128 | if we square those values,\par 129 | the square function is one to one.\par 130 | And specifically if we let Y equal to be X square,\par 131 | then y will get the value of zero square, which is zero.\par 132 | One square which is one.\par 133 | Or two square which is four.\par 134 | So these are the values.\par 135 | And what is the probability y will get those value,\par 136 | well y would be zero if X was zero\par 137 | which happens probability half.\par 138 | Y will be one if X was one.\par 139 | Which happens probability of one third.\par 140 | And y will be four if X was two.\par 141 | Which happens probability of one sixth.\par 142 | So these are the probabilities.\par 143 | But now let's look at the range\par 144 | where the square function is many to one.\par 145 | In particular let's look at the range\par 146 | where X varies from minus two to plus two.\par 147 | So minus two, minus one, all the way up to plus two.\par 148 | According to a uniform distribution.\par 149 | So X is gets each of those values probability of one fifth.\par 150 | Then Y again is X square.\par 151 | And now Y will have fewer values than X.\par 152 | Particularly it will have the same values\par 153 | that it would have before, zero one four.\par 154 | And let's see, what is the probability that y is zero.\par 155 | Y is zero if X was zero.\par 156 | And that happens probability one fifth.\par 157 | That's what we get here.\par 158 | Now more interestingly, y is one if X was either\par 159 | minus one, or plus one.\par 160 | Because in both cases, X squared is going to be one.\par 161 | And so X is minus one or one, with probability two fifths.\par 162 | And therefore the probability that y is one, is two fifths.\par 163 | And y is four if X was minus two or plus two\par 164 | because in both cases, X squared is going to be four.\par 165 | So y is going to be four.\par 166 | And X is minus two or plus two\par 167 | with probability one fifth plus one fifth,\par 168 | which is two fifths.\par 169 | So we see now that y is ranging over a smaller\par 170 | set of values.\par 171 | Three instead of five.\par 172 | And for each one, or at least for two of them,\par 173 | the probability of one of y comes from multiple values of X.\par 174 | All of them mapped to the same 
way.\par 175 | So let's see this in a picture.\par 176 | So here are the values of X.\par 177 | Zero, minus one, plus one,\par 178 | minus two, and plus two.\par 179 | And here is Y which is g of X,\par 180 | and our g of X, or g is X square.\par 181 | So Y is zero, one, and four.\par 182 | And now zero will map to zero.\par 183 | And minus one and plus one will both map to one\par 184 | by g or by X square.\par 185 | And minus two and plus two will both map to four.\par 186 | So when we look at the inverse mapping of zero of Y,\par 187 | we get the inverse mapping of zero is zero.\par 188 | The inverse mapping of one is minus one and plus one.\par 189 | The inverse mapping, or the inverse image of four\par 190 | is minus two and two.\par 191 | And the probability that Y is four,\par 192 | is the probability that X is in this inverse image of four.\par 193 | And the probability that Y is one,\par 194 | is the probability that X is in this inverse image of one.\par 195 | Which is minus one and one.\par 196 | So we can say therefore that the probability\par 197 | that Y is one, is the probability that g of X\par 198 | is equal to y.\par 199 | Because that's by definition, y is g of X.\par 200 | So supposedly g of X is equal to y.\par 201 | And taking the inverse mapping,\par 202 | it's the probability that X is in the inverse image of y.\par 203 | And what is the probability that X\par 204 | is in the inverse image of y for example here,\par 205 | minus two and two.\par 206 | It's just the sum of the probabilities of those Xs.\par 207 | So it's summation of all X in the inverse image of y\par 208 | of the probability of X.\par 209 | So for example, if we take four,\par 210 | and the probability that Y is four,\par 211 | is the summation of all Xs in the inverse image of four.\par 212 | Namely, minus two and two of the probabilities.\par 213 | So it's one fifth plus one fifth.\par 214 | Okay so as we see,\par 215 | when the mapping is many to one,\par 216 | then to calculate the probability of Y,\par 217 | we need to sum the probability of X\par 218 | in the inverse image of Y.\par 219 | So that's all there is to it.\par 220 | So we want to just introduce this concept\par 221 | of variable modifications.\par 222 | Or functions of random variables\par 223 | and next time we're going to look at\par 224 | the expectations of this modifications.\par 225 | See you then.\par 226 | End of transcript. Skip to the start.\par 227 | POLL\par 228 | \par 229 | Let\rquote s see if we have any risk takers in the class. Suppose you flip a fair coin. With each flip you must bet a certain amount of money. If the coin lands heads, you get double your money in return. If the coin lands tails, you lose your money. 
How much money would you be willing to bet during this game?\par 230 | \par 231 | \tab\par 232 | $0\par 233 | \par 234 | \tab\par 235 | $10\par 236 | \par 237 | \tab\par 238 | $100\par 239 | \par 240 | \tab\par 241 | $1000\par 242 | \par 243 | Submit\par 244 | } 245 | -------------------------------------------------------------------------------- /Week 5 Random Variables, Expectation, and Variance/8 Linearity of Expectations.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Today, I'm going to talk about Linearity of Expectations.\par 4 | And so, if we want to calculate\par 5 | the expected value of\par 6 | the sum of two random variables, x and y,\par 7 | we can just write it as the summation of all xy\par 8 | of the sum, x plus y times the probability,\par 9 | and that's going to be\par 10 | broken up, if you want,\par 11 | to the summation of over x times P xy,\par 12 | plus the summation of all y times P xy.\par 13 | And then we can see that we can take\par 14 | x does not change with y,\par 15 | so we can take it outside the y sum.\par 16 | So we get summation over x\par 17 | of this summation over y of P xy\par 18 | plus summation over y of y, which we took out.\par 19 | And then, inside, we have summation over all x\par 20 | of P xy, where here we exchange the order first\par 21 | and then took the y out.\par 22 | Now, when we sum over P xy,\par 23 | then by the rule of total probability,\par 24 | this will sum to P of x.\par 25 | So we get summation x, P of x.\par 26 | And here, we have a summation of all x of P xy.\par 27 | This will sum to P of y.\par 28 | So, this is going to give us summation over y\par 29 | of y times the probability of y.\par 30 | And this is nothing but the expected value of x.\par 31 | And the second sum is the expected value of y.\par 32 | So, what we see is that the expected value of x plus y\par 33 | is the expected value of x plus the expected value of y.\par 34 | Or, in words, we see that the expectation of the sum\par 35 | is the sum of expectations,\par 36 | and this has many applications,\par 37 | and we're going to actually have a video about that.\par 38 | Now, of course, the next natural thing to consider is\par 39 | whether the variance adds as well.\par 40 | So, the expectations add, namely,\par 41 | the expected value of x plus y\par 42 | is the expected value of x\par 43 | plus the expected value of y,\par 44 | and it's very natural to ask\par 45 | whether the variances add as well, namely,\par 46 | is the variance of x plus y\par 47 | the same as the variance of x plus the variance of y?\par 48 | Now, to figure out if this is correct,\par 49 | let us write the variance of the sum.\par 50 | So, the variance of x plus y\par 51 | is the expected value of x plus y squared\par 52 | minus the expected value of x plus y,\par 53 | the whole thing squared.\par 54 | That's out of the second formulation for the variance.\par 55 | It's the expected value of the random variable squared\par 56 | minus the expected value of the random variable,\par 57 | the whole thing squared.\par 58 | And so, this is the same as\par 59 | writing the expected value of when we square x plus y.\par 60 | It's the expected value of x squared\par 61 | plus 2 xy plus y squared,\par 62 | and here we have minus the expected value of x\par 63 | plus the 
expected value of y, the whole thing squared.\par 64 | And this, we can open up.\par 65 | We can say this is the expected value of x squared\par 66 | plus twice the expected value of xy\par 67 | plus the expected value of y squared minus,\par 68 | and when we open this product,\par 69 | it's the expected value of x, the whole thing squared\par 70 | plus twice the expected value of x\par 71 | times the expected value of y\par 72 | plus the expected value of y,\par 73 | the whole quantity squared.\par 74 | So, we can regroup things, and we get\par 75 | this is the same as the expected value of x squared\par 76 | minus the square of the expected value of x\par 77 | plus the expected value of y squared\par 78 | minus the square of the expected value of y\par 79 | plus twice the expected value of xy\par 80 | minus twice the expected value of x\par 81 | times the expected value of y.\par 82 | Now, the first two terms are the variance of x,\par 83 | and the next two terms are the variance of y,\par 84 | and what we're left with is\par 85 | twice the expected value of xy\par 86 | minus twice the expected value of x\par 87 | times the expected value of y.\par 88 | So, the answer as to whether the variance of x plus y\par 89 | is equal to the variance of x plus the variance of y\par 90 | depends on whether the expected value of xy\par 91 | equals the expected value of x\par 92 | times the expected value of y.\par 93 | So, this is the question that\par 94 | we really probably should have asked before.\par 95 | We saw that the expected value of x plus y\par 96 | is the sum of the expectations;\par 97 | now the question is whether the expectation of the product\par 98 | is the product of the expectations, namely, do expectations multiply?\par 99 | So, that is an even more basic question\par 100 | than whether the variances add,\par 101 | and that's what we want to look at next.\par 102 | So, this is what we're going to do in a separate video\par 103 | because it would take us some time to discuss this.\par 104 | See you then.\par 105 | End of transcript. 
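To see both claims numerically, here is a rough simulation sketch (an added example, not part of the course notebooks); the dependent variable Z below is an arbitrary illustrative choice.

```python
# Expectations add for any X and Y; variances add only when E[XY] = E[X]E[Y]
# (which holds, for example, when X and Y are independent).
import random

def mean(xs):
    return sum(xs) / len(xs)

def var(xs):
    m = mean(xs)
    return mean([(x - m) ** 2 for x in xs])

n = 200000
X = [random.randint(1, 6) for _ in range(n)]   # fair die
Y = [random.randint(1, 6) for _ in range(n)]   # independent second die
Z = [x + random.randint(0, 1) for x in X]      # strongly dependent on X

print(mean([x + y for x, y in zip(X, Y)]), mean(X) + mean(Y))   # ~equal: E[X+Y] = EX + EY
print(mean([x + z for x, z in zip(X, Z)]), mean(X) + mean(Z))   # ~equal even with dependence

print(var([x + y for x, y in zip(X, Y)]), var(X) + var(Y))      # ~equal (independent)
print(var([x + z for x, z in zip(X, Z)]), var(X) + var(Z))      # clearly different (dependent)
```

The gap in the last printed pair is roughly twice E[XZ] minus E[X] times E[Z], exactly the leftover covariance term from the derivation above.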
Skip to the start.\par 106 | POLL\par 107 | \par 108 | \par 109 | Which of the following equation(s) is/are true?\par 110 | \par 111 | a) E(X+2Y) = EX + 2EY \par 112 | b) E(X+Y2) = EX + (EY)2 \par 113 | \par 114 | \tab\par 115 | a\par 116 | \par 117 | \tab\par 118 | b\par 119 | \par 120 | \tab\par 121 | a, b\par 122 | \par 123 | \tab\par 124 | None of the above\par 125 | } 126 | -------------------------------------------------------------------------------- /Week 5 Random Variables, Expectation, and Variance/Week_5_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 5 Random Variables, Expectation, and Variance/Week_5_Part_1.pdf -------------------------------------------------------------------------------- /Week 5 Random Variables, Expectation, and Variance/Week_5_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 5 Random Variables, Expectation, and Variance/Week_5_Part_2.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/1 Distribution Families.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hello and welcome back.\par 4 | So we have introduced random variables,\par 5 | and in this lecture I thought I'll just start\par 6 | and present what we're going to talk about next.\par 7 | So in relative, the random variables typically belong\par 8 | to certain families of distributions.\par 9 | And as you can see on this slide,\par 10 | there are many distribution families.\par 11 | But don't worry, we are going to focus on\par 12 | the most natural and most important ones,\par 13 | those that are significant\par 14 | both theoretically and practically.\par 15 | So we'll talk about very few,\par 16 | and I'm going to quickly tell you what we'll do for them\par 17 | in this very brief presentation.\par 18 | So the distribution we'll talk about\par 19 | will be for the discrete distributions.\par 20 | We'll talk about the Bernoulli distribution,\par 21 | binomial distributions, Poisson,\par 22 | and geometric distributions.\par 23 | And then after that,\par 24 | we will introduce continuous distributions\par 25 | and then we'll discuss uniform, exponential,\par 26 | and normal distributions.\par 27 | So for each of those distributions,\par 28 | what we are going to do is we will motivate them\par 29 | and discuss some applications.\par 30 | We'll provide the formulation for this distribution\par 31 | or this distribution family.\par 32 | And then we'll visualize them and see what they look like.\par 33 | And we'll provide some examples,\par 34 | and then we'll describe some properties,\par 35 | and typically we'll describe the mean, the variance,\par 36 | and the standard deviations.\par 37 | And for some distributions\par 38 | we'll discuss other properties as well.\par 39 | And then the notebooks that you have\par 40 | have some Python implementations\par 41 | so you can further plot the distributions\par 42 
| and experiment with them and do certain other things.\par 43 | Okay?\par 44 | And now, when you want to show\par 45 | that a certain function is a distribution,\par 46 | we need to establish that we know two things.\par 47 | We need to show that it's nonnegative\par 48 | and we need to show that the values sum to one.\par 49 | Showing that the values are nonnegative\par 50 | is typically obvious,\par 51 | and showing that it sums to one takes a little more effort.\par 52 | So to make it maybe slightly less boring\par 53 | we're going to borrow a page from a company called Blendtec.\par 54 | They make a product which is fairly standard\par 55 | and uninteresting.\par 56 | It's a blender.\par 57 | And so to attract more people to them,\par 58 | they came up with a campaign\par 59 | that features the founder, Tom Dickson.\par 60 | And it's a commercial campaign.\par 61 | It's called Will it Blend?\par 62 | And in this commercial, Tom Dickson, the founder,\par 63 | takes different things and he checks whether they'll blend.\par 64 | So for example,\par 65 | he checks whether a Nike shoe will blend,\par 66 | or whether a garden rake will blend,\par 67 | or whether an iPhone will blend,\par 68 | or whether a Justin Bieber doll will blend,\par 69 | and so on.\par 70 | And typically the answer is the same,\par 71 | and that is yes, these will blend.\par 72 | So we are just going to try to slightly mimic this\par 73 | and we will ask, when we want to see whether it sums to one,\par 74 | we will ask will it add?\par 75 | And we will see\par 76 | whether these things add or don't add to one.\par 77 | Next we're going to start with\par 78 | the first collection of distributions\par 79 | which will be the Bernoulli distribution.\par 80 | See you then.\par 81 | End of transcript. Skip to the start.\par 82 | Previous\par 83 | } 84 | -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/5 Geometric.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 POLL\par 4 | \par 5 | Which of the following distributions is memoryless?\par 6 | \par 7 | \tab\par 8 | Poisson distribution\par 9 | \par 10 | \tab\par 11 | Geometric distribution\par 12 | \par 13 | \tab\par 14 | Both\par 15 | \par 16 | \tab\par 17 | None of them\par 18 | } 19 | -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/6_conditional_probability_hw.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "nbgrader": { 7 | "grade": false, 8 | "locked": false, 9 | "solution": false 10 | } 11 | }, 12 | "source": [ 13 | " **IMPORTANT: ** When submitting this homework notebook, please modify only the cells that start with:<\/font>\n", 14 | "\n", 15 | "```python\n", 16 | "# modify this cell\n", 17 | "```" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Conditional Probability and Baye's Rule\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Problem" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "There are two urns $A$ and $B$. 
Urn $A$ contains $r_A$ red balls and $w_A$ white balls whereas urn $B$ contains $r_B$ red balls and $w_B$ white balls. One of the urns is picked at random and then one ball is picked at random from this urn. Write a function **conditional_probability** that calculates the conditional probability that the randomly chosen ball belonged to urn $A$ given that it is white. Assume that $\\frac{r_A}{w_A}\\neq\\frac{r_B}{w_B}$.\n", 39 | "\n", 40 | " **Code**<\/font>\n", 41 | "```python\n", 42 | "rA, wA, rB, wB = 1., 2., 2., 1.\n", 43 | "conditional__probability(rA, wA, rB, wB) \n", 44 | "```\n", 45 | "\n", 46 | " **Output**<\/font>\n", 47 | "```\n", 48 | "0.6666666666666666\n", 49 | "```" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true, 57 | "scrolled": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# modify this cell\n", 62 | "\n", 63 | "def conditional__probability(rA, wA, rB, wB):\n", 64 | " # inputs: all of them are of type 'float'\n", 65 | " # output: a variable of type 'float'\n", 66 | " \n", 67 | " #\n", 68 | " # YOUR CODE HERE\n", 69 | " #\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": true, 77 | "nbgrader": { 78 | "grade": true, 79 | "grade_id": "ex1", 80 | "locked": true, 81 | "points": "5", 82 | "solution": false 83 | } 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "assert( abs(conditional__probability(2., 4., 3., 3.) -0.5714285714285715) < 10**-5) \n", 88 | "assert( abs(conditional__probability(1., 3., 5., 2.) -0.7241379310344829) < 10**-5) \n", 89 | "\n", 90 | "#\n", 91 | "# AUTOGRADER TEST - DO NOT REMOVE\n", 92 | "#\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "\n", 104 | "\n", 105 | "\n", 106 | "\n", 107 | "\n", 108 | "\n", 109 | "\n" 110 | ] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text\/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.12" 130 | }, 131 | "toc": { 132 | "colors": { 133 | "hover_highlight": "#DAA520", 134 | "navigate_num": "#000000", 135 | "navigate_text": "#333333", 136 | "running_highlight": "#FF0000", 137 | "selected_highlight": "#FFD700", 138 | "sidebar_border": "#EEEEEE", 139 | "wrapper_background": "#FFFFFF" 140 | }, 141 | "moveMenuLeft": true, 142 | "nav_menu": { 143 | "height": "48px", 144 | "width": "252px" 145 | }, 146 | "navigate_menu": true, 147 | "number_sections": true, 148 | "sideBar": true, 149 | "threshold": 4, 150 | "toc_cell": false, 151 | "toc_section_display": "block", 152 | "toc_window_display": false, 153 | "widenNotebook": false 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 2 158 | } -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Problem Set 6 _ 6.12 Problem Sets _ DSE210x Courseware _ edX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 
6 Discrete and Continuous Distribution/Problem Set 6 _ 6.12 Problem Sets _ DSE210x Courseware _ edX.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Programming Assignment _ 6.13 Programming Assignment _ DSE210x Courseware _ edX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Programming Assignment _ 6.13 Programming Assignment _ DSE210x Courseware _ edX.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Quiz 6 _ 6.11 Comprehension Quiz _ DSE210x Courseware _ edX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Quiz 6 _ 6.11 Comprehension Quiz _ DSE210x Courseware _ edX.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Week_6_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Week_6_Part_1.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Week_6_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Week_6_Part_2.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Week_6_Part_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Week_6_Part_3.pdf -------------------------------------------------------------------------------- /Week 6 Discrete and Continuous Distribution/Week_6_Part_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 6 Discrete and Continuous Distribution/Week_6_Part_4.pdf -------------------------------------------------------------------------------- /Week 7 Inequalities and Limit Theorems/Problem Set 7 _ 7.9 Problem Sets _ DSE210x Courseware _ edX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 7 Inequalities and Limit Theorems/Problem Set 7 _ 7.9 Problem Sets _ DSE210x Courseware _ edX.pdf -------------------------------------------------------------------------------- /Week 7 
Inequalities and Limit Theorems/Programming Assignment _ 7.10 Programming Assignment _ DSE210x Courseware _ edX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 7 Inequalities and Limit Theorems/Programming Assignment _ 7.10 Programming Assignment _ DSE210x Courseware _ edX.pdf -------------------------------------------------------------------------------- /Week 7 Inequalities and Limit Theorems/Quiz 7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 7 Inequalities and Limit Theorems/Quiz 7.pdf -------------------------------------------------------------------------------- /Week 7 Inequalities and Limit Theorems/inequalities_HW.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | " **IMPORTANT: ** When submitting this homework notebook, please modify only the cells that start with:<\/font>\n", 8 | "\n", 9 | "```python\n", 10 | "# modify this cell\n", 11 | "```" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "**Note:** notice that no packages are imported for this assignment. This is because you do not need any python packages." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# Probability Inequalities\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "For the binomial distribution $X\\sim B_{p,n}$ with mean $\\mu=np$ and variance $\\sigma^2=np(1-p)$, \n", 33 | "we would like to upper bound the probability $P(X\\ge c\\cdot \\mu)$ for $c\\ge1$. 
\n", 34 | "The lectures introduced three bounds:\n", 35 | "\n", 36 | "Markov: $$P(X\\ge \\alpha\\mu)\\le \\frac{1}{\\alpha},\\quad\\quad\\forall \\alpha\\ge 1,$$\n", 37 | "Chebyshev: $$P(|X-\\mu|\\ge \\alpha\\sigma)\\le \\frac{1}{\\alpha^2},\\quad\\quad \\forall \\alpha\\ge 1,$$\n", 38 | "Note that, while double-sided, this inequality also bounds $P(X\\ge\\mu+\\alpha)$\n", 39 | "$$P(X\\ge \\mu+\\alpha\\sigma)\\le P(|X-\\mu|\\ge \\alpha\\sigma)\\le \\frac{1}{\\alpha^2},$$\n", 40 | "Chernoff: $$P(X\\ge (1+\\delta)\\mu)\\le e^{-\\frac{\\delta^2}{2+\\delta}\\mu},\\quad\\quad\\forall \\delta\\ge0.$$\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "from math import exp" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "import exponential function exp from math" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Problem 1" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Write three functions **Markov**, **Chebyshev** and **Chernoff** that take $n$, $p$ and $c$ as inputs and return the upper bounds for $P(X\\ge c\\cdot np)$ given by the above Markov, Chebyshev, and Chernoff inequalities as outputs.\n", 73 | "\n", 74 | " **Code:**<\/font>\n", 75 | "```python\n", 76 | "print Markov(100.,0.2,1.5)\n", 77 | "print Chebyshev(100.,0.2,1.5)\n", 78 | "print Chernoff(100.,0.2,1.5)\n", 79 | "```\n", 80 | "\n", 81 | "\n", 82 | " **Output**<\/font>\n", 83 | "```\n", 84 | "0.6666666666666666\n", 85 | "0.16\n", 86 | "0.1353352832366127\n", 87 | "\n", 88 | "```" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "\n", 100 | "# modify this cell\n", 101 | "\n", 102 | "def Markov(n, p, c):\n", 103 | " # inputs: 3 floats as described above\n", 104 | " # output: a variable of type float\n", 105 | " \n", 106 | " #\n", 107 | " # YOUR CODE HERE\n", 108 | " #\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "\n", 120 | "# modify this cell\n", 121 | "\n", 122 | "def Chebyshev(n, p, c):\n", 123 | " # inputs: 3 floats as described above\n", 124 | " # output: a variable of type float\n", 125 | " \n", 126 | " #\n", 127 | " # YOUR CODE HERE\n", 128 | " #\n" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "\n", 140 | "# modify this cell\n", 141 | "\n", 142 | "def Chernoff(n, p, c):\n", 143 | " # inputs: 3 floats as described above\n", 144 | " # output: a variable of type float\n", 145 | " \n", 146 | " #\n", 147 | " # YOUR CODE HERE\n", 148 | " #\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "nbgrader": { 156 | "grade": true, 157 | "grade_id": "ex1", 158 | "locked": true, 159 | "points": "5", 160 | "solution": false 161 | } 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "assert (Markov(200.,0.25,1.25)-0.8)< 10**-5\n", 166 | "assert (Chebyshev(100.,0.25,1.25)-0.48)< 10**-5\n", 167 | "assert (Chernoff(100.,0.25,1.25)-0.4993517885992762)< 10**-5\n", 168 | "#\n", 169 | "# AUTOGRADER TEST - DO NOT REMOVE\n", 170 | "#\n" 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "\n", 182 | "\n", 183 | "\n", 184 | "\n", 185 | "\n", 186 | "\n", 187 | "\n", 188 | "\n" 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 2", 195 | "language": "python", 196 | "name": "python2" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 2 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text\/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython2", 208 | "version": "2.7.12" 209 | }, 210 | "toc": { 211 | "colors": { 212 | "hover_highlight": "#DAA520", 213 | "navigate_num": "#000000", 214 | "navigate_text": "#333333", 215 | "running_highlight": "#FF0000", 216 | "selected_highlight": "#FFD700", 217 | "sidebar_border": "#EEEEEE", 218 | "wrapper_background": "#FFFFFF" 219 | }, 220 | "moveMenuLeft": true, 221 | "nav_menu": { 222 | "height": "48px", 223 | "width": "252px" 224 | }, 225 | "navigate_menu": true, 226 | "number_sections": true, 227 | "sideBar": true, 228 | "threshold": 4, 229 | "toc_cell": false, 230 | "toc_section_display": "block", 231 | "toc_window_display": false, 232 | "widenNotebook": false 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 2 237 | } -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/1. Stats.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hello, and welcome back.\par 4 | So far, we talked about probability\par 5 | where everything was designed by us.\par 6 | Like, we said, okay, this is a uniform distribution,\par 7 | it behaves exactly in that way.\par 8 | Or this is a geometric distribution or binomial.\par 9 | Everything was very clean and very precise\par 10 | and behaved exactly the way we wanted it.\par 11 | And now, we're going to move to the real world\par 12 | where things are not exactly the way they're planned.\par 13 | They look a little different, they behave\par 14 | a little differently, sometimes not what we what we expect\par 15 | or not even what we want, but still\par 16 | we need to deal with them.\par 17 | So, that's what we're going to do\par 18 | in the statistics part of this course.\par 19 | So, in some sense, probability and statistics\par 20 | are somewhat of opposites of each other.\par 21 | In probability, as we said, we assume some distribution,\par 22 | we come up with it, and this,\par 23 | and then we take samples and we say,\par 24 | here are the properties of the samples that we take.\par 25 | For example, we have a distribution,\par 26 | we can define the mean, mu, to be the summation\par 27 | of X times P of X, and then we say that if we take\par 28 | many samples, then the average value\par 29 | is going to be roughly mu.\par 30 | Or, if we have a distribution over values\par 31 | that are non-negative, so the random variable\par 32 | will be non-negative, then as we saw for Markov's\par 33 | inequality, the probability that we'll get a value\par 34 | which is bigger than twice the mean,\par 35 | we calculate it here, is at most half.\par 36 | And in statistics, it's, 
as we said,\par 37 | a little bit the opposite.\par 38 | We get samples, and from the samples we want\par 39 | to deduce properties of the distribution,\par 40 | or maybe what the distribution is.\par 41 | So we want to deduce some parameters\par 42 | of the distribution, for example, the mean\par 43 | or the standard deviation, or we want to say\par 44 | what type of distribution it is.\par 45 | Is it Gaussian, is it geometric,\par 46 | or maybe it's none of those.\par 47 | The first thing we're going to do\par 48 | is we're going to look at estimating\par 49 | distribution parameters.\par 50 | So, most distribution families that we saw\par 51 | are determined by parameters.\par 52 | For example, if we have Bernoulli distribution,\par 53 | it's determined by the success probability, P.\par 54 | If we have a binomial distribution,\par 55 | it's determined by the same P and also the number\par 56 | of samples that we take.\par 57 | If we have Poisson distribution,\par 58 | it's determined by the parameter lambda,\par 59 | which is the mean, and so on.\par 60 | If you have a geometric or a uniform\par 61 | or exponential or normal distribution,\par 62 | all of them are determined by parameters.\par 63 | So these are distribution parameters.\par 64 | But, you can view parameters more generally\par 65 | to be any deterministic function of the distribution.\par 66 | And sometimes these are called properties.\par 67 | So for example you can say that the mean,\par 68 | if you have a distribution, for example,\par 69 | binomial PN, then the mean is a parameter\par 70 | of the distribution, in this case it's P times N.\par 71 | Or the variance of the distribution here it's going\par 72 | to be NPQ, that's something that's determined\par 73 | by the distribution and so it's a parameter\par 74 | of the distribution.\par 75 | Or, the standard deviation, or we can take\par 76 | the min or max values.\par 77 | For example, for a geometric distribution,\par 78 | then the smallest value, the min value\par 79 | is the smallest value that has positive probability,\par 80 | which will be one, and the maximum value will be infinity.\par 81 | Or we can look at the mode, which is the value\par 82 | that has the highest probability.\par 83 | All of these are determined by the distribution\par 84 | of interest, and you might want to find\par 85 | what they are from from samples, okay, or the median.\par 86 | So the way we do it is by sampling\par 87 | from the distribution.\par 88 | So a distribution could be discrete,\par 89 | in which case we call it P,\par 90 | it's a probability mass function.\par 91 | Or distribution can be continuous, in which case\par 92 | we have a probability density function, F.\par 93 | And then we're going to take independent samples\par 94 | from P or from F.\par 95 | And we denote the samples by X superscript N,\par 96 | it's short for X one, X two up to X N, which as we said\par 97 | they're chosen independently from P.\par 98 | So they're chosen from P and independently of each other.\par 99 | And from these samples we want to deduce\par 100 | properties of the distribution.\par 101 | Or, instead of looking at distribution,\par 102 | what we often want to do is look at populations.\par 103 | So, a population is a collection of objects,\par 104 | typically many of them.\par 105 | For example we can take all students at UCSD\par 106 | so that's a population of students.\par 107 | Or all patients in a hospital, that's a population.\par 108 | And we want to deduce properties of this 
population.\par 109 | So what we do is we sample N objects\par 110 | from this collection of object.\par 111 | And typically N is much smaller\par 112 | than the population size, so we don't want to take\par 113 | all students at UCSD but we want to take\par 114 | a small sample, called it number N,\par 115 | and from that sample we want to deduce\par 116 | properties of the population as a whole.\par 117 | So in this case we pick for example N students at random,\par 118 | and we want to, as we said, to deduce population parameter\par 119 | from the samples.\par 120 | For example, maybe you want to deduce the average height\par 121 | of all students at UCSD by just sampling 100 of them.\par 122 | And so we can view, so this might look like\par 123 | it's a different problem, because here\par 124 | we have a physical population that we're sampling from.\par 125 | It might look like it's a different problem\par 126 | from estimating parameters of a distribution.\par 127 | But in fact we can view it as the same.\par 128 | So, we can view, for example, if we're looking\par 129 | at the heights, we can view the collection\par 130 | of heights as a distribution.\par 131 | So we have maybe, a person whose height is, you know,\par 132 | five foot, or five foot one, and another person's height\par 133 | is five feet, and so on.\par 134 | So we have all these, and now what we're going to do\par 135 | is we're going to sample from them.\par 136 | So we sample from the population is like sampling\par 137 | from the distribution that has so many,\par 138 | so many people with given height,\par 139 | and so many people with another height, and so on.\par 140 | So we're just sampling from them uniformly\par 141 | from this collection.\par 142 | And there's a small difference\par 143 | between this and this sampling that we had\par 144 | in the previous slide, in the sense that\par 145 | before they were IID and here,\par 146 | if we're sampling from the population,\par 147 | then we're picking a set of people,\par 148 | and these people are going to be distinct.\par 149 | So, it's not exactly IID.\par 150 | For example, if you have a population of size two,\par 151 | and you pick two of them, you know that\par 152 | they are different, and you couldn't pick\par 153 | the same person twice.\par 154 | But, in what we're going to look at\par 155 | N, the number of samples that you pick\par 156 | is much smaller than the population size.\par 157 | And in that case, the probability of repeats\par 158 | if you pick them independently will be\par 159 | fairly small, so we can view what we have\par 160 | as roughly independent.\par 161 | So under this assumption that N the sample size\par 162 | is much smaller than the population,\par 163 | then the selection, even though we're selecting\par 164 | without replacement will be very similar\par 165 | to selecting with replacement, because the probability\par 166 | that we'll get repeats is small.\par 167 | If we get repeats, there will be very few of them.\par 168 | So, with this assumption we can therefore\par 169 | assume that we have the same problem of estimating\par 170 | parameters of population as estimating\par 171 | parameters of a distribution.\par 172 | So, when we have this sample, then we're going\par 173 | to look at the sample and we're going to look\par 174 | at functions of the data.\par 175 | For example, the average of all the values\par 176 | that we get from the data,\par 177 | or the maximum value that we observe.\par 178 | And any function 
of the data\par 179 | is going to be called a statistic.\par 180 | What we want to do is we want to use the statistics\par 181 | to infer properites of the distribution or the population.\par 182 | So I want to look for example at the average\par 183 | or maximum value that we observed in our sample,\par 184 | and from these things we want to deduce\par 185 | some properties of the distribution or the population.\par 186 | And for example, we may want to deduce\par 187 | the parameter, like the mean of the distribution\par 188 | or the maximum of all elements in the population, and so on.\par 189 | Or we may want to deduce the type\par 190 | of distribution that's in effect.\par 191 | What we're going to do in the rest\par 192 | of the presentations in this sequence\par 193 | is see how to do this, and how to do this well.\par 194 | And so this was just a brief introduction\par 195 | to what we're going to discuss in the next presentations.\par 196 | Next, we're going to talk about possibly\par 197 | the simplest problem, which is estimating\par 198 | the mean of a distribution.\par 199 | And so that's what we're going to do next time,\par 200 | and see you then.\par 201 | End of transcript. Skip to the start.\par 202 | POLL\par 203 | } 204 | -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/Problem Set 8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 8 Statistics and Parameter Estimation/Problem Set 8.pdf -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/Programming Assignment 8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 8 Statistics and Parameter Estimation/Programming Assignment 8.pdf -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/Quiz 8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 8 Statistics and Parameter Estimation/Quiz 8.pdf -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/Week_8_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 8 Statistics and Parameter Estimation/Week_8_Part_1.pdf -------------------------------------------------------------------------------- /Week 8 Statistics and Parameter Estimation/Week_8_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 8 Statistics and Parameter Estimation/Week_8_Part_2.pdf -------------------------------------------------------------------------------- /Week 9 Regression and PCA/3. 
Solving a System of Linear Equations.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hi, last time we talked about matrices,\par 4 | and now we will see the first use of matrices\par 5 | for solving a system of linear equations.\par 6 | So linear equations, the simplest case\par 7 | of system of linear equation\par 8 | is two equations with two unknowns.\par 9 | And that corresponds to finding a line\par 10 | that passes through two points.\par 11 | So we're given two points in the plane,\par 12 | minus one, two, and one, one.\par 13 | And we want to find the line\par 14 | that passes through these points.\par 15 | So here is the picture, here is our plane,\par 16 | and this is the point two, minus one.\par 17 | And this is the point one, one.\par 18 | And we want to find the line that would be\par 19 | a straight line going through these two points.\par 20 | So any line, other than the vertical line,\par 21 | can be expressed using this expression,\par 22 | y equals w zero, plus w one x,\par 23 | where w zero and w one are some numbers,\par 24 | and x and y are the points along the line\par 25 | connecting these points.\par 26 | W zero is the intercept with the y-axis,\par 27 | and y one is the slope of the line.\par 28 | So to identify the line that passes through the two points,\par 29 | we need to find the w zero and w one\par 30 | that satisfy the two constraints\par 31 | determined by the two points.\par 32 | So for the point minus one, two,\par 33 | if we write the expression, we see that w zero minus w one,\par 34 | should be equal to two.\par 35 | And for the point one, one, we see that w zero plus w one,\par 36 | should be equal to one.\par 37 | So these are two equations with two unknowns,\par 38 | probably know how to solve this directly,\par 39 | but let's do it through matrices\par 40 | because that will generalize.\par 41 | So we want to write these equations in matrix form.\par 42 | So here is the way that we can write it,\par 43 | we can write the matrix, one, minus one,\par 44 | one, one, times the vector, w zero, w one,\par 45 | is equal to the vector, two, one.\par 46 | If you basically write out the two dot products\par 47 | that correspond to the first row and the second row,\par 48 | you see that you get exactly the equations\par 49 | that we had before, and now we can just\par 50 | represent this as, a w times b,\par 51 | a times w, equals b.\par 52 | If we basically call this a, this w, and this b.\par 53 | So this is what we do.\par 54 | So a is called the matrix of coefficient,\par 55 | b is the ordinate or dependent variable vector,\par 56 | and the parameter vector is w.\par 57 | It's this part that we don't know.\par 58 | So how can we find w?\par 59 | So we want to find the w, such that a w equal to b.\par 60 | Now if a is invertible, so a is square,\par 61 | but if it is also invertible, which it is in this case,\par 62 | then we can multiply both sides by a to the minus one,\par 63 | and we get that a to the minus one, times a w,\par 64 | is equal a to the minus one, times b.\par 65 | But that's also equal, on the left here,\par 66 | is equal to w, because a to the minus one, times a\par 67 | is the identity matrix.\par 68 | So here is how this looks when we use numpy.\par 69 | We have the matrix a and the column vector b,\par 
70 | and we want to find the inverse of a.\par 71 | So inverse of a, we just use the command inv(A)\par 72 | and we find the inverse,\par 73 | and then we check that it is indeed the inverse,\par 74 | so if you multiply a times the inverse of a,\par 75 | we get the identity matrix.\par 76 | And then the solution is inverse of a, times b.\par 77 | And so what we get here is that the solution\par 78 | is 1.5, minus 0.5, so that's the vector w\par 79 | that we need to use.\par 80 | Alternatively, we can just use the solve operation\par 81 | in numpy, and that will give us the solution directly.\par 82 | Now that we have the vector w, we want to define the line\par 83 | that is represented by w, and as we've said,\par 84 | this is simply w zero, plus w one, times x.\par 85 | So we define a function\par 86 | that gives you the value y for every value of x,\par 87 | w is fixed here, and what we see is that if we basically\par 88 | write f of minus one, it's two,\par 89 | and f of one is one, so indeed we see that the line\par 90 | goes through the two points we wanted it to go through.\par 91 | And when we draw it, we can see\par 92 | that this is indeed the case.\par 93 | So we found the line that goes through\par 94 | the two points that we were given.\par 95 | Now that was two points, but what about\par 96 | if we have more than two points?\par 97 | So in general, if you have more than two points\par 98 | on the two-dimensional plane,\par 99 | there is no line that goes through these points.\par 100 | So here is an example.\par 101 | Here is one, two points that we had before,\par 102 | and here is a third point.\par 103 | Now this third point doesn't lie\par 104 | on the line connecting these two,\par 105 | so there is no line that would go through all of them.\par 106 | Still, many times we do want to solve problems\par 107 | that have more points than we have dimensions.\par 108 | So when the number of points is larger\par 109 | than the number of dimensions,\par 110 | we say that the system is over-determined.\par 111 | That means that there is no line\par 112 | that goes exactly through the points.\par 113 | However, we do still want to find a line\par 114 | that passes close to the points.\par 115 | So that's what we're going to talk about in the next video.\par 116 | See you then.\par 117 | End of transcript. Skip to the start.\par 118 | POLL\par 119 | } 120 | -------------------------------------------------------------------------------- /Week 9 Regression and PCA/4. 
Linear Regression.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Hi, last time we talked about\par 4 | finding a line that passes through two points on the plane.\par 5 | And we raised the question at the end about\par 6 | what about having more than two points.\par 7 | Can we find a line that passes\par 8 | close to these points?\par 9 | Okay, so that's basically the idea of regression and\par 10 | this video is going to introduce you to the\par 11 | notion of regression and\par 12 | how we solve it using numpy.\par 13 | So here's a small example,\par 14 | we have nine points on the plane,\par 15 | defined by their x, y positions,\par 16 | so here are our points, okay?\par 17 | So these are the points, and clearly there is no single line\par 18 | that passes through all of these points.\par 19 | But, also clearly, there is a line that passes very close to\par 20 | all of them, okay this line that is tending upwards.\par 21 | So, how can we find that line?\par 22 | So the line is going to be defined as before,\par 23 | by a function of the form w zero plus w one times x.\par 24 | And we want to find w zero and w one.\par 25 | So previously, we saw how to find that when\par 26 | there are just two points, and so there is a line\par 27 | that passes exactly through the points,\par 28 | and then it was just matrix inversion.\par 29 | Here, there's more than two points,\par 30 | and the system is overconstrained, there is no straight line\par 31 | that passes through all the points.\par 32 | So, while they don't fall,\par 33 | there's no line that falls exactly\par 34 | on all of the points, we can find a line\par 35 | that will be close to the points,\par 36 | but we need to define somehow what we mean by close.\par 37 | So what we are going to use\par 38 | is this idea of square difference.\par 39 | Okay, so for every point, xi yi, we're going to calculate\par 40 | the value of the line at that x,\par 41 | and then take the difference from that and y,\par 42 | which is the actual position of the point,\par 43 | and square that, why do we square it?\par 44 | Because we always want to be positive,\par 45 | if we're not exactly at the point.\par 46 | If we're exactly at the point, we have zero.\par 47 | So, we want this, this cost,\par 48 | the square cost to be a function that is bigger and bigger\par 49 | the further you are from the points,\par 50 | and our goal is to find a minimum.\par 51 | So, this method of looking for,\par 52 | for minimizing the squared difference\par 53 | is called the least square method,\par 54 | and we are going to look for the least square solution.\par 55 | Okay, so we're going to use matrix notation,\par 56 | and we're going to use numpy linalg,\par 57 | the library, to find this optimal vector w,\par 58 | that minimizes the square error.\par 59 | So, we're going to use the following matrices,\par 60 | first we're going to define A to be this matrix,\par 61 | where there is one column that is just all ones,\par 62 | and the second column is the x values,\par 63 | and then we're going to have\par 64 | a column vector that is all the y values,\par 65 | and then we're going to have a small column vector\par 66 | with just two values, that is going to be the weight\par 67 | vector that we're looking for.\par 68 | So, now we can find, we 
can define the errors\par 69 | to be aw, which gives us the,\par 70 | the y vectors as measured by fx,\par 71 | as computed by fx, minus y, that's the difference,\par 72 | and that's, now, it's the difference vector.\par 73 | And what we're interested is in the sum of the squares\par 74 | of the differences, and that turns out to be\par 75 | exactly the square of the norm of d,\par 76 | remember the norm is the length of d, we're looking for\par 77 | the w that will make d as short as possible.\par 78 | Okay, so how do we do this in numpy?\par 79 | We define the vectors a and y, as I said,\par 80 | I'm just printing here a transposed and y transposed,\par 81 | so that they fit nicely in the slide.\par 82 | And, then we just call the,\par 83 | the function in numpy called least square,\par 84 | give it the matrix a, and the vector y,\par 85 | and we're just interested in the perimeter,\par 86 | so we're going to take the first component\par 87 | of the answer from this, and that's the vector w,\par 88 | and if we print out w we see that it's 19,\par 89 | and 0.7166 and so on, okay, so this is w zero,\par 90 | the offset, and this is the slope.\par 91 | And if we now plot this line\par 92 | on the previous graph,\par 93 | we see that it is indeed like the line that we expected,\par 94 | and the little green segments here\par 95 | represent the errors, the differences,\par 96 | so you see that for most of the points\par 97 | the difference is very small,\par 98 | for some points the difference is significantly larger.\par 99 | But, this is the line that would minimize\par 100 | the total of the square of the lengths\par 101 | of these green lines.\par 102 | Okay, so that was a toy example,\par 103 | just to show you how you do this kind of thing,\par 104 | in a small number of examples where you can\par 105 | essentially see everything and,\par 106 | and it makes sense.\par 107 | In real life, we usually have not just\par 108 | nine or 10 or 20 points,\par 109 | we have a huge number of points, and we want to\par 110 | find a line that passes through close to the,\par 111 | to all of these points.\par 112 | So, here is a real data set,\par 113 | which has 25,000 people, their height and their weight.\par 114 | The height in inches, and the weight in pounds.\par 115 | And, if we solve\par 116 | the least square for\par 117 | this data set, what we get is the following,\par 118 | is, this is, these are our points,\par 119 | you see it's now a cloud of points,\par 120 | we have a huge number of points,\par 121 | and the red line is the best line that passes through\par 122 | those points, so what this red line tells us,\par 123 | is, not surprisingly, that as weight,\par 124 | as the height increases, the weight also tends to increase.\par 125 | But, this is by no means explaining all of the variation\par 126 | in the weight, for the same, for the same height\par 127 | you have a big variation in the,\par 128 | in the weight of the person.\par 129 | Okay, but this is the, what we would call\par 130 | the linear regression line.\par 131 | So, to get to the slightly\par 132 | more refined understanding of what this line tells us,\par 133 | it is useful to draw what is called the graph of averages.\par 134 | So this is the graph of averages,\par 135 | and basically what I've done is that I split the\par 136 | height into many ranges,\par 137 | of about one inch I think,\par 138 | and for each of these ranges, I found the,\par 139 | the mean, the mean value, and that mean value\par 140 | is the red 
dot, so that is called the graph of averages,\par 141 | and what you see from this graph of averages\par 142 | is that the tendency described by the,\par 143 | by the, the line that we found,\par 144 | is actually well-represented,\par 145 | well-representing the points of the averages,\par 146 | so the averages are indeed going, increasing linearly.\par 147 | So remember, this, this black line,\par 148 | we found it according, by minimizing the square error\par 149 | for all of the points, but it passes close to\par 150 | the center for these points.\par 151 | And only in the edges where we have very few examples,\par 152 | do we have significant deviation from that,\par 153 | okay, so if we ignore these very,\par 154 | very short or very tall people,\par 155 | that are outliers, we see that the linear graph\par 156 | is a good representation of the graph of averages.\par 157 | We'll see that, in some other cases, this is not the case.\par 158 | The, for every problem that we do in two dimensions,\par 159 | like here from weight, from height to weight,\par 160 | or vice versa, we have two regression lines.\par 161 | One is for, to predict\par 162 | the weight from the height,\par 163 | so if I give you the height of the person,\par 164 | I can find a function that is this straight line\par 165 | that will predict the weight,\par 166 | but I can also do it the other way, I can predict the\par 167 | height from the weight, okay?\par 168 | So, and the two functions will not coincide,\par 169 | so, here is the result of doing that,\par 170 | you see this is our red line,\par 171 | this red line is what we had, the previous line\par 172 | for predicting the weight from the height,\par 173 | the black line is predicting the height from the weight,\par 174 | so if you give me the, the weight,\par 175 | I will predict the height that is associated with it,\par 176 | so you see that these are two different lines,\par 177 | depending on what it is that we're trying to predict,\par 178 | and what we're using to predict it.\par 179 | So, in the next video,\par 180 | we're going to talk about polynomial regression,\par 181 | and that is a case in which\par 182 | linear regression might not be good enough,\par 183 | and we need to do something a little bit more sophisticated.\par 184 | So I'll see you then.\par 185 | End of transcript. Skip to the start.\par 186 | POLL\par 187 | } 188 | -------------------------------------------------------------------------------- /Week 9 Regression and PCA/5. 
Polynomial Regression.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - Last time, we talked about linear regression,\par 4 | and we fit a straight line into the data, okay?\par 5 | This time we're going to look at fitting\par 6 | more complicated curves into the data,\par 7 | and we're going to use for that polynomial regression.\par 8 | Okay, so let's get started with a little review.\par 9 | When we had our previous video, we looked at this graph\par 10 | of averages; and we saw that the averages, the red dots,\par 11 | fall along a straight line most of the time.\par 12 | At the extremes they fall, they might deviate\par 13 | because there is so little data.\par 14 | Okay, but most of the time\par 15 | they're very close to the straight line.\par 16 | But we might get data that looks more like this, okay?\par 17 | So here the averages, the red dots,\par 18 | are not really along a straight line.\par 19 | So, it's not going to work very well\par 20 | if we try to do linear regression.\par 21 | So there's nothing stopping us from doing linear regression;\par 22 | we will just get poor results.\par 23 | So this is what we'll get.\par 24 | Here is the straight line, and you see that it's\par 25 | really doesn't capture the shape of the data, okay?\par 26 | So how do we try to capture the shape\par 27 | of this non-linear data?\par 28 | We can try a second degree polynomial, okay?\par 29 | So I'm not going into what polynomials are.\par 30 | You can go to the notebook, and they'll give you\par 31 | some pointers; but this is second degree polynomial.\par 32 | It's basically similar to the first degree polynomial\par 33 | in the first two terms, but then it has a second,\par 34 | a third term that is W2 times X squared, okay?\par 35 | And now we want to fit all\par 36 | of these three parameters to the data.\par 37 | So we do that, again, using a least square;\par 38 | and the only thing that is really different is that\par 39 | instead of just having ones and the values\par 40 | of the Y values, we have also the Y values squared.\par 41 | Okay, and once we have that, we can just use the same\par 42 | least square function to find these, the W0, W1, and W2;\par 43 | and we can plot them, and we see that now\par 44 | with this second degree polynomial, we got a very nice\par 45 | and smooth curves that goes through,\par 46 | again, most of the data, okay?\par 47 | So we can be happy about that.\par 48 | Okay, so now here is an interesting question.\par 49 | You're given some data and you want to fit to it a line\par 50 | or a curve, but you don't really know what degree polynomial\par 51 | to fit; you know, do you wanna fit a first degree,\par 52 | just a linear line or second or third or fourth or fifth?\par 53 | That's a very good question.\par 54 | You don't really know what would work best,\par 55 | so how do you decide?\par 56 | Okay, so there are two phenomena that appear\par 57 | when you try to do such a thing.\par 58 | One is called underfit, and that's what we saw\par 59 | before with the straight line.\par 60 | The straight line was not rich enough.\par 61 | It was not flexible enough in order to fit the data;\par 62 | so it underfit the data.\par 63 | And then there is the opposite problem of overfit.\par 64 | So you use a model and it fits the data too 
well.\par 65 | That sounds strange; what do I mean by\par 66 | it fits the data too well?\par 67 | Why is this a problem?\par 68 | It's a problem because we're not really interested always\par 69 | in just fitting the data that we see,\par 70 | we want to also fit new data that we haven't see yet\par 71 | that comes from the same distribution.\par 72 | So that leads to the concept of\par 73 | training error and test error.\par 74 | What we do is we take the data that we have,\par 75 | and we randomly partition it into two parts, okay?\par 76 | The two parts are essentially\par 77 | statistically equivalent, but they are disjoint.\par 78 | Each example is either in the training set\par 79 | or in the test set, and we use the training set\par 80 | to find the best polynomial; and then we use the test set\par 81 | to test that polynomial that we found on new data.\par 82 | And now we can express what we actually mean by overfitting.\par 83 | If we increase the degree of the polynomial,\par 84 | the training error, the error that we get\par 85 | on the training data, will always decrease.\par 86 | It just keeps decreasing because the polynomials are more\par 87 | and more flexible as you make them higher and higher degree;\par 88 | but if you look at the test error, you'll see that initially\par 89 | it behaves like the training error, but at some point,\par 90 | the training error continues to decrease\par 91 | and the test error will start to increase\par 92 | because we're overfitting the training error\par 93 | and then we're performing badly on the test data.\par 94 | Okay, so this increase is called overfitting.\par 95 | That's what we mean when we say overfitting.\par 96 | So we're going to use a simple data, data set\par 97 | to analyze this and here is the data set;\par 98 | and we see that it has a general tendency going up,\par 99 | and we might think that maybe it has,\par 100 | maybe a straight line would fit it well\par 101 | or maybe a polynomial would, sorry, a parabola\par 102 | would fit it well, a second degree polynomial.\par 103 | But we don't know, so let's see what we can do.\par 104 | Okay, so we split the data into training set and test set,\par 105 | as I said before; and now let's say that we fit\par 106 | degree three polynomial to the training sector.\par 107 | Okay, we get this thing; and what you see is that\par 108 | this second, third degree polynomial fits the blue dots\par 109 | very well because that's the training data.\par 110 | But it doesn't really fit the red dots very well.\par 111 | Okay, so, it doesn't perform well on the test data;\par 112 | and we can see that in the numbers up here the training\par 113 | root mean square error is 0.04\par 114 | and the test root mean square error is 0.5, okay?\par 115 | So the test root mean square error is significantly bigger\par 116 | than the train, and it seems like we're overfitting.\par 117 | But we can only really judge it if we try a bunch\par 118 | of degrees for the polynomials and see what fits best.\par 119 | So we're going to do that.\par 120 | We're going to fit all degrees from zero to five.\par 121 | So here they are; and what is degrees zero?\par 122 | Degree zero is basically a constant.\par 123 | We basically fit the whole data with the constant,\par 124 | and the constant turns out to be simply the mean.\par 125 | Okay, so with the mean, we have very poor fit.\par 126 | It's definitely underfitting, and it's,\par 127 | and the performance is pretty bad\par 128 | on both training and 
testing.\par 129 | When we go to first degree, we see that there is a nice fit\par 130 | to the training data, not perfect;\par 131 | but in the root mean square for the training data is 0.16,\par 132 | and the root mean square for the test data is 0/22.\par 133 | So higher than the training error, but not very high.\par 134 | Once we go to degree two, we see that the training error\par 135 | decreased somewhat; and the test error increased.\par 136 | And when we go to degree three, like we saw before,\par 137 | the training data, the training error decreases even more;\par 138 | but the test data increases very significantly.\par 139 | And what happens at this point is that once you have\par 140 | a degree four polynomial, you can fit the data perfectly.\par 141 | Right, so you can simply go through all of the points;\par 142 | and that's great training error,\par 143 | but the test error is very, very high.\par 144 | Okay, so from that we basically see\par 145 | that the best degree polynomial to choose\par 146 | is the first degree polynomial, the straight line.\par 147 | That was our intuition too, but it's kind of hard\par 148 | to depend on intuition when you have data\par 149 | that is very high dimensional or very large.\par 150 | So what we saw is that the minimum\par 151 | of the root mean square occurred of the equal one.\par 152 | And here is the graph that shows it.\par 153 | We see that the, that the training error keeps going down.\par 154 | The horizontal is a degree and this is\par 155 | the root mean square error; and if we look\par 156 | at the test error, it goes down to one,\par 157 | but then it starts to increase and increase.\par 158 | Okay, so that tells us that the minimum point here,\par 159 | the best model to use is simply a linear function,\par 160 | degree one polynomial.\par 161 | So these concepts that I just introduced,\par 162 | they're very central to statistics\par 163 | and to machine learning in general.\par 164 | So this is just kind of a first dip into them,\par 165 | and you will Learn much more about them\par 166 | when you go into the machine learning course.\par 167 | And in the next video, we'll talk about another\par 168 | related subject, which is principle component analysis.\par 169 | Thank you.\par 170 | End of transcript. Skip to the start.\par 171 | Previous\par 172 | } 173 | -------------------------------------------------------------------------------- /Week 9 Regression and PCA/6. 
Regression Towards the Mean.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - In this video, I want to tell you about something\par 4 | that is an interesting and curious phenomenon,\par 5 | and it is something that shows that statistics can\par 6 | sometimes result in situations that are unintuitive.\par 7 | And it's a very common thing, so it's worth knowing about.\par 8 | It's called regression to the mean.\par 9 | So, here's an example.\par 10 | Suppose students take a standard exam at two time points,\par 11 | maybe before and after a quarter,\par 12 | and they get two grades.\par 13 | The grades are, let's say, on a scale of 0-100,\par 14 | and the average grade is 70.\par 15 | Here is what tends to happen.\par 16 | Suppose that the first grade that the student got is 97.\par 17 | Then the second grade that the student gets\par 18 | is likely to be lower than 97.\par 19 | And the opposite is also true.\par 20 | If the first grade the student got is 55,\par 21 | then the second grade is likely to be higher than 55.\par 22 | So, remember, the mean is 70, and this is what we call\par 23 | regression to the mean, because you're moving,\par 24 | and the second case, you're moving closer to the mean.\par 25 | But the strange thing is that this has nothing to do\par 26 | with whether the student learned or not.\par 27 | It just happens, even if the two grades\par 28 | are completely statistically independent.\par 29 | And also, it happens if you exchange grade 1 and grade 2.\par 30 | So you say, what is grade 1,\par 31 | given that, in grade 2, you got 97?\par 32 | It will tend to be smaller than 97.\par 33 | So, it doesn't really tell us anything.\par 34 | It doesn't tell us that those students\par 35 | that were good initially became bad,\par 36 | or those that were bad initially became good.\par 37 | It is just something that happens because of statistics.\par 38 | It doesn't tell us anything interesting,\par 39 | and we can't conclude anything from that.\par 40 | So, here is one of the first works\par 41 | that showed this.\par 42 | It was a work comparing fathers' and sons' height,\par 43 | and it was done by Sir Francis Galton\par 44 | in 1886, so quite awhile ago.\par 45 | And basically, it says that extreme characteristics,\par 46 | let's say height in parents, are not passed on\par 47 | completely to their offspring.\par 48 | So, you'd think tall parents\par 49 | tend to have tall children, and it's true,\par 50 | but the children would tend to be less tall.\par 51 | And so, here is some data that\par 52 | was actually used to study this.\par 53 | What you have on the x-axis is the height of the father,\par 54 | and what you have on the y-axis is the height of the son.\par 55 | And what you see is that there is\par 56 | definitely relationship between them.\par 57 | As the height of the father increases,\par 58 | the height of the son increases.\par 59 | But if you look at the actual values,\par 60 | you see that the height of the parent can go from,\par 61 | let's say, 58"-77",\par 62 | and the related son height will\par 63 | just go from 63"-72", so it's much closer\par 64 | to the mean height.\par 65 | A different way to look at it that is useful\par 66 | is to look really at the difference.\par 67 | So, instead of looking at the absolute value\par 68 
| of the height of the son, you look at\par 69 | what's the difference between the height of the son\par 70 | and the height of the parent.\par 71 | So, you get this kind of plot.\par 72 | So, in this plot, it is even more striking\par 73 | that the higher the parent is,\par 74 | the shorter, relatively to the parent, the son is.\par 75 | And again, that has nothing to do with whether\par 76 | there is like some kind of limitation\par 77 | in the genetics or so on.\par 78 | It's simply a result of regression\par 79 | to the mean that always happens,\par 80 | even if things are completely independent.\par 81 | So, this is the conclusion.\par 82 | Suppose you have a pair of independent and identically\par 83 | distributed random variables, X1 and X2.\par 84 | If you select pairs X1, X2, such that X1 is\par 85 | far from the mean of the distribution,\par 86 | then X2, for those pairs, will\par 87 | tend to be closer to the mean.\par 88 | And you can do the same if you choose\par 89 | X2 to be far from the mean.\par 90 | You'll get X1 is closer to the mean.\par 91 | So, next time, we're going to talk\par 92 | about principle component analysis.\par 93 | End of transcript. Skip to the start.\par 94 | Previous\par 95 | } 96 | -------------------------------------------------------------------------------- /Week 9 Regression and PCA/7. Components Analysis.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1251\deff0\nouicompat\deflang1058{\fonttbl{\f0\fnil\fcharset0 Calibri;}} 2 | {\*\generator Riched20 10.0.16299}\viewkind4\uc1 3 | \pard\sa200\sl276\slmult1\f0\fs22\lang9 - We are reaching the end of the topic of regression.\par 4 | And the last thing I want to tell you about\par 5 | is a subject called principal components analysis.\par 6 | So, the subject of principal component analysis,\par 7 | to actually fully understand it,\par 8 | you need a good understanding of eigenvectors,\par 9 | eigenvalues, matrix decomposition, and so on.\par 10 | And I'm not going to provide those.\par 11 | I'm just going to give you a somewhat more superficial\par 12 | understanding, but that still gives you\par 13 | some of the intuition about how PCA works.\par 14 | So let's start with a quick review of linear regression.\par 15 | Suppose we have nine points in the plane like that.\par 16 | And here is a plot of these points.\par 17 | And you see that they are not exactly\par 18 | on a straight line, but they kind of tend\par 19 | to go up from the left to the right.\par 20 | So Y tends to increase as X increases.\par 21 | So, we would like to find a line\par 22 | that would represent it.\par 23 | And what we're going to do is look\par 24 | for a line with the formula w zero plus w one times x.\par 25 | And the goal is really to find these two parameters,\par 26 | w zero and w one.\par 27 | And this is actually very easy using NumPy.\par 28 | We just use the least square function\par 29 | and we get the w's that we wanted.\par 30 | And when we plot them, we see this line.\par 31 | And what I added here are the green lines\par 32 | that basically indicate the amount of error\par 33 | associated with each point.\par 34 | And what we're trying to basically do\par 35 | is minimize the square of the lengths of these segments.\par 36 | So in the regression problem,\par 37 | we looked at the function that predicts y from x.\par 38 | Okay, and we saw that if we try to do the opposite,\par 39 | we predict x from y, we get actually a different function,\par 40 | a 
different line.\par 41 | So it matters if we go this way or in the reverse.\par 42 | In general, we call this kind of problem\par 43 | in machine learning, we call it supervised learning.\par 44 | Why?\par 45 | Because the idea is that the output\par 46 | or the thing that we're trying to predict\par 47 | is labeled by some supervisor,\par 48 | somebody that knows what is the correct value of y,\par 49 | and then we are just trying to predict\par 50 | that value of y.\par 51 | Okay?\par 52 | But we can also fit the line\par 53 | without deciding on a direction.\par 54 | So, there is a way to fit a line to this data\par 55 | that doesn't have anything to do\par 56 | with whether we choose x or we choose y\par 57 | or we rotate the whole thing any way we want.\par 58 | So this is called unsupervised learning\par 59 | because here we're just basically given data\par 60 | and nobody identifies a particular component\par 61 | as something that we're trying to predict.\par 62 | So if we want to do unsupervised learning\par 63 | for a linear function by using squared error,\par 64 | this is called PCA, principal component analysis.\par 65 | So both principal component analysis\par 66 | and regression minimize the same loss function,\par 67 | the root mean squared error.\par 68 | But the definitions of the error are different\par 69 | as I'll show you in the next figure.\par 70 | So let me make this a little larger\par 71 | so you can see more clearly.\par 72 | What we see here is the black line\par 73 | is the regression line.\par 74 | And the errors to the regression line\par 75 | are basically the vertical, these green vertical segments.\par 76 | On the other hand, the red line is the PCA solution,\par 77 | and the errors for the PCA solution\par 78 | are not vertical but basically they're orthogonal\par 79 | to the line itself.\par 80 | So, basically the error for this point\par 81 | is this blue line here.\par 82 | And what we see is that when we try\par 83 | to minimize this kind of error,\par 84 | then basically you see that we get a different result.\par 85 | The red line is not the same as the black line.\par 86 | And importantly, if we think about rotating\par 87 | this coordinate system, we will see\par 88 | that the regression result will change\par 89 | because we are changing the relationship\par 90 | between x and y but the PCA result will not change.\par 91 | It is somehow directly associated with the data\par 92 | and not with the coordinate system.\par 93 | So again, errors for the black regression line\par 94 | correspond to the vertical green segment\par 95 | and errors for the red PCA line\par 96 | correspond to the blue segments\par 97 | that are orthogonal to the red line.\par 98 | An alternative way to think about PCA\par 99 | is about maximizing variance.\par 100 | So suppose that we have a set of vectors\par 101 | x one to x n, and then we take a unit vector, u,\par 102 | if you remember from the linear algebra review,\par 103 | a unit vector that has length one\par 104 | and we take the dot product of this u\par 105 | with each one of the vectors,\par 106 | then we get a number.\par 107 | And then we can calculate the mean of that number\par 108 | and the variance of the number.\par 109 | And we're particularly interested in the variance,\par 110 | how much spread out is the data along this projection.\par 111 | So here is one example.\par 112 | We're projecting on this red line,\par 113 | and we see that the points fall pretty far\par 114 | away from the mean.\par 115 | 
On the other hand, if we use this direction,\par 116 | we see that the points fall closer to the mean.\par 117 | So the points in this direction\par 118 | are more bunched together and the variance is smaller.\par 119 | So suppose you consider all possible directions?\par 120 | In the two-dimension case it's very simple.\par 121 | It's all directions from zero to 360 degrees.\par 122 | And for each direction, we compute the standard deviation,\par 123 | so the square root of the variance.\par 124 | And we put a point that distance away\par 125 | from the mean, from the origin point\par 126 | which is the mean.\par 127 | So the collection of all of these points\par 128 | when we draw them out will form an ellipse,\par 129 | something like this.\par 130 | So basically, if you go from this point\par 131 | and you project along this line,\par 132 | then the standard deviation is big.\par 133 | And if you go orthogonal to that,\par 134 | the standard deviation is small.\par 135 | So this ellipse essentially represents\par 136 | all the information that exists\par 137 | in the variance of the projections.\par 138 | So PCA relates to this picture in the following way.\par 139 | The larger axis of the ellipse corresponds\par 140 | to the direction of maximum variance.\par 141 | And that is what's called the first eigenvector\par 142 | of the principal component analysis.\par 143 | So that is the direction that gives you\par 144 | the maximum variance.\par 145 | The smaller axis of this ellipse\par 146 | corresponds to the direction of minimum variance.\par 147 | So that gives you the second eigenvector\par 148 | which is orthogonal to the first one.\par 149 | So the nice thing with this is\par 150 | that it's not really restricted\par 151 | just to two dimensions.\par 152 | You can do it in very high dimensions.\par 153 | And what you get is that the direction\par 154 | that gives you the highest variance\par 155 | is the first eigenvector, and the second direction\par 156 | is the second eigenvector,\par 157 | so the second direction in which you get maximum variance\par 158 | but it is orthogonal to the first direction and so on,\par 159 | you go down.\par 160 | And basically, that kind of describes\par 161 | a big ellipsoid in space that basically represents\par 162 | somehow the distribution of the data.\par 163 | So let's see a real-life example of that\par 164 | right here just in two dimensions.\par 165 | So we're going back to the data\par 166 | that has the weight and the height of 25,000 people.\par 167 | And here is the first component of the PCA analysis.\par 168 | Okay, so this is the direction\par 169 | that gives you the highest variance\par 170 | when you project on it, when you project the data on it.\par 171 | So it's pretty intuitive.\par 172 | This is the direction the data is most,\par 173 | it's most distributed widely across this direction.\par 174 | Okay, so this way of looking at PCA\par 175 | provides one of the common ways to normalize data.\par 176 | Normalizing data is very useful\par 177 | because it puts some of the variation\par 178 | into some parameters and then leaves\par 179 | the rest of the variation to be studied.\par 180 | So how do we use PCA to normalize data?\par 181 | First, we subtract the mean.\par 182 | So by subtracting the mean, we make the new mean zero.\par 183 | And then we rotate the data so that\par 184 | the coordinates are the eigenvectors.\par 185 | So this looks something like this.\par 186 | If we have the original data here,\par 187 | the 
This way of looking at PCA provides one of the common ways to normalize data. Normalizing data is very useful because it absorbs some of the variation into a few parameters and leaves the rest of the variation to be studied. So how do we use PCA to normalize data? First, we subtract the mean; by subtracting the mean, we make the new mean zero. Then we rotate the data so that the coordinate axes are the eigenvectors. It looks something like this: for the original data here, the mean is somewhere around (125, 70). After subtracting the mean, the new mean is (0, 0). After rotating, the maximum variation lies along the x-axis and the smaller variation along the y-axis. This is something that is done quite a lot.

Let's see another small application of PCA, this time in computer vision. Here is a picture of a blob in an image, say a rectangle, and we want to capture the size and orientation of that rectangle. We can map this directly onto our PCA problem: the pixels of the blob (the little blue dots) are the data points. Doing a PCA, we find that the first eigenvector points along the blob, and the associated standard deviation, up to a constant factor, gives the blob's extent in that direction. In other words, the blob is oriented mostly along this direction, and this is roughly its size. This works for any shape; it doesn't have to be a rectangle.
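Both uses just described, normalization and estimating a blob's orientation, come down to the same two steps: subtract the mean, then rotate into the eigenvector basis of the covariance matrix. Here is a hedged NumPy sketch; the synthetic rotated-rectangle "blob", the pca_normalize helper, and the particular numbers are illustrative assumptions, not part of the course materials.

```python
import numpy as np

def pca_normalize(X):
    """Center the data and rotate it so the coordinate axes are the eigenvectors.

    Returns the transformed data, the mean, and the eigenvector matrix,
    with columns ordered from largest to smallest variance.
    """
    mean = X.mean(axis=0)
    centered = X - mean
    cov = np.cov(centered, rowvar=False)
    eigvals, eigvecs = np.linalg.eigh(cov)   # ascending eigenvalues
    order = np.argsort(eigvals)[::-1]        # largest variance first
    eigvecs = eigvecs[:, order]
    return centered @ eigvecs, mean, eigvecs

# Synthetic "blob": pixel coordinates of a rectangle rotated by 30 degrees
# and shifted somewhere into the image.
rng = np.random.default_rng(1)
rect = np.column_stack([rng.uniform(-4, 4, 2000), rng.uniform(-1, 1, 2000)])
angle = np.deg2rad(30)
R = np.array([[np.cos(angle), -np.sin(angle)],
              [np.sin(angle),  np.cos(angle)]])
blob = rect @ R.T + np.array([50.0, 80.0])

normalized, mean, axes = pca_normalize(blob)

# The first eigenvector recovers the blob's orientation, and the standard
# deviation of each rotated coordinate is proportional to its extent.
orientation = np.degrees(np.arctan2(axes[1, 0], axes[0, 0]))
print("estimated orientation (degrees):", orientation % 180)
print("extent along each axis (std):   ", normalized.std(axis=0))
```

After the rotation, the largest variation lies along the first coordinate, which is exactly the normalization step in the lecture, and the recovered angle (about 30 degrees for this synthetic blob) is the orientation PCA reads off the pixels.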
To summarize, we talked about PCA and regression. Both are ways to model data by minimizing the root mean squared error. Regression is a supervised method: you have to choose what quantity you are trying to predict. Principal component analysis is an unsupervised method. Unsupervised methods tend to be used earlier, when you have just received your raw data and are trying to summarize it or reduce its dimension so that the supervised part can be done more efficiently. Both methods are based on linear algebra, and because of that they are very efficient, far more efficient than methods that depend on gradient descent. This is the end of the regression topic. I hope you found it interesting, and we'll continue next week.

End of transcript.

--------------------------------------------------------------------------------
/Week 9 Regression and PCA/HW_9.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 9 Regression and PCA/HW_9.zip
--------------------------------------------------------------------------------
/Week 9 Regression and PCA/Programming Assignment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 9 Regression and PCA/Programming Assignment.pdf
--------------------------------------------------------------------------------
/Week 9 Regression and PCA/Quiz 9.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 9 Regression and PCA/Quiz 9.pdf
--------------------------------------------------------------------------------
/Week 9 Regression and PCA/lectures.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 9 Regression and PCA/lectures.zip
--------------------------------------------------------------------------------
/Week 9 Regression and PCA/more_lectures.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VSerpak/DSE210x-Statistics-and-Probability-in-Data-Science-using-Python/a5dc913864f6ad5666764d8736f0aa373b75dc94/Week 9 Regression and PCA/more_lectures.zip
--------------------------------------------------------------------------------